illumos-gate Old usr/src/uts/common/fs/nfs/nfs

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  25  *      All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/types.h>
  30 #include <sys/systm.h>
  31 #include <sys/cred.h>
  32 #include <sys/time.h>
  33 #include <sys/vnode.h>
  34 #include <sys/vfs.h>
  35 #include <sys/vfs_opreg.h>
  36 #include <sys/file.h>
  37 #include <sys/filio.h>
  38 #include <sys/uio.h>
  39 #include <sys/buf.h>
  40 #include <sys/mman.h>
  41 #include <sys/pathname.h>
  42 #include <sys/dirent.h>
  43 #include <sys/debug.h>
  44 #include <sys/vmsystm.h>
  45 #include <sys/fcntl.h>
  46 #include <sys/flock.h>
  47 #include <sys/swap.h>
  48 #include <sys/errno.h>
  49 #include <sys/strsubr.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/kmem.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/pathconf.h>
  54 #include <sys/utsname.h>
  55 #include <sys/dnlc.h>
  56 #include <sys/acl.h>
  57 #include <sys/atomic.h>
  58 #include <sys/policy.h>
  59 #include <sys/sdt.h>
  60 
  61 #include <rpc/types.h>
  62 #include <rpc/auth.h>
  63 #include <rpc/clnt.h>
  64 
  65 #include <nfs/nfs.h>
  66 #include <nfs/nfs_clnt.h>
  67 #include <nfs/rnode.h>
  68 #include <nfs/nfs_acl.h>
  69 #include <nfs/lm.h>
  70 
  71 #include <vm/hat.h>
  72 #include <vm/as.h>
  73 #include <vm/page.h>
  74 #include <vm/pvn.h>
  75 #include <vm/seg.h>
  76 #include <vm/seg_map.h>
  77 #include <vm/seg_kpm.h>
  78 #include <vm/seg_vn.h>
  79 
  80 #include <fs/fs_subr.h>
  81 
  82 #include <sys/ddi.h>
  83 
  84 static int      nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
  85                         cred_t *);
  86 static int      nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
  87 static int      nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
  88 static int      nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
  89 static int      nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
  90 static int      nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
  91 static int      nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
  92                         caller_context_t *);
  93 static int      nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
  94 static int      nfs_bio(struct buf *, cred_t *);
  95 static int      nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
  96                         page_t *[], size_t, struct seg *, caddr_t,
  97                         enum seg_rw, cred_t *);
  98 static void     nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
  99                         cred_t *);
 100 static int      nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
 101                         int, cred_t *);
 102 static int      nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
 103                         int, cred_t *);
 104 static void     nfs_delmap_callback(struct as *, void *, uint_t);
 105 
  106 /*
  107  * Error flags used to pass information about certain special errors
  108  * which need to be handled specially.
       *
       * NFS_EOF is negative, which keeps it distinct from the positive errno
       * values (EIO, EINTR, ...) that the routines in this file return.
       * NOTE(review): presumably an internal end-of-file indication; nothing
       * in this part of the file uses it -- confirm against its users.
  109  */
  110 #define NFS_EOF                 -98
 111 
 112 /*
 113  * These are the vnode ops routines which implement the vnode interface to
 114  * the networked file system.  These routines just take their parameters,
 115  * make them look networkish by putting the right info into interface structs,
 116  * and then calling the appropriate remote routine(s) to do the work.
 117  *
  118  * Note on directory name lookup caching:  If we detect a stale fhandle,
 119  * we purge the directory cache relative to that vnode.  This way, the
 120  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 121  * more details on rnode locking.
 122  */
 123 
 124 static int      nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
 125 static int      nfs_close(vnode_t *, int, int, offset_t, cred_t *,
 126                         caller_context_t *);
 127 static int      nfs_read(vnode_t *, struct uio *, int, cred_t *,
 128                         caller_context_t *);
 129 static int      nfs_write(vnode_t *, struct uio *, int, cred_t *,
 130                         caller_context_t *);
 131 static int      nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 132                         caller_context_t *);
 133 static int      nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
 134                         caller_context_t *);
 135 static int      nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
 136                         caller_context_t *);
 137 static int      nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 138 static int      nfs_accessx(void *, int, cred_t *);
 139 static int      nfs_readlink(vnode_t *, struct uio *, cred_t *,
 140                         caller_context_t *);
 141 static int      nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 142 static void     nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
 143 static int      nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
 144                         int, vnode_t *, cred_t *, caller_context_t *,
 145                         int *, pathname_t *);
 146 static int      nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 147                         int, vnode_t **, cred_t *, int, caller_context_t *,
 148                         vsecattr_t *);
 149 static int      nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 150                         int);
 151 static int      nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
 152                         caller_context_t *, int);
 153 static int      nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 154                         caller_context_t *, int);
 155 static int      nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 156                         cred_t *, caller_context_t *, int, vsecattr_t *);
 157 static int      nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 158                         caller_context_t *, int);
 159 static int      nfs_symlink(vnode_t *, char *, struct vattr *, char *,
 160                         cred_t *, caller_context_t *, int);
 161 static int      nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
 162                         caller_context_t *, int);
 163 static int      nfs_fid(vnode_t *, fid_t *, caller_context_t *);
 164 static int      nfs_rwlock(vnode_t *, int, caller_context_t *);
 165 static void     nfs_rwunlock(vnode_t *, int, caller_context_t *);
 166 static int      nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 167 static int      nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
 168                         page_t *[], size_t, struct seg *, caddr_t,
 169                         enum seg_rw, cred_t *, caller_context_t *);
 170 static int      nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 171                         caller_context_t *);
 172 static int      nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 173                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 174 static int      nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 175                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 176 static int      nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 177                         struct flk_callback *, cred_t *, caller_context_t *);
 178 static int      nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
 179                         cred_t *, caller_context_t *);
 180 static int      nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
 181 static int      nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 182                         uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 183 static int      nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 184                         caller_context_t *);
 185 static int      nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
 186                         cred_t *, caller_context_t *);
 187 static int      nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 188                         caller_context_t *);
 189 static int      nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 190                         caller_context_t *);
 191 static int      nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 192                         caller_context_t *);
 193 
  194 struct vnodeops *nfs_vnodeops;    /* returned by nfs_getvnodeops() */
  195 
      /*
       * Template pairing each vnode operation name (VOPNAME_xxx) with the
       * routine that implements it for NFS.
       *
       * - Nearly every entry names one of the static nfs_xxx() routines
       *   declared above.  nfs_accessx() is declared above as well but has no
       *   entry here.
       * - nfs_dump has no static prototype in this file section, so its
       *   declaration must come from elsewhere.
       *   NOTE(review): presumably one of the <nfs/...> headers -- confirm.
       * - VOPNAME_VNEVENT is served by fs_vnevent_support rather than by an
       *   NFS-specific routine.
       * - The list is terminated by the final NULL, NULL entry.
       *
       * nfs_vnodeops is not assigned anywhere in this part of the file.
       * NOTE(review): presumably it is built from this template when the
       * file system is initialized -- confirm.
       */
  196 const fs_operation_def_t nfs_vnodeops_template[] = {
  197         VOPNAME_OPEN,           { .vop_open = nfs_open },
  198         VOPNAME_CLOSE,          { .vop_close = nfs_close },
  199         VOPNAME_READ,           { .vop_read = nfs_read },
  200         VOPNAME_WRITE,          { .vop_write = nfs_write },
  201         VOPNAME_IOCTL,          { .vop_ioctl = nfs_ioctl },
  202         VOPNAME_GETATTR,        { .vop_getattr = nfs_getattr },
  203         VOPNAME_SETATTR,        { .vop_setattr = nfs_setattr },
  204         VOPNAME_ACCESS,         { .vop_access = nfs_access },
  205         VOPNAME_LOOKUP,         { .vop_lookup = nfs_lookup },
  206         VOPNAME_CREATE,         { .vop_create = nfs_create },
  207         VOPNAME_REMOVE,         { .vop_remove = nfs_remove },
  208         VOPNAME_LINK,           { .vop_link = nfs_link },
  209         VOPNAME_RENAME,         { .vop_rename = nfs_rename },
  210         VOPNAME_MKDIR,          { .vop_mkdir = nfs_mkdir },
  211         VOPNAME_RMDIR,          { .vop_rmdir = nfs_rmdir },
  212         VOPNAME_READDIR,        { .vop_readdir = nfs_readdir },
  213         VOPNAME_SYMLINK,        { .vop_symlink = nfs_symlink },
  214         VOPNAME_READLINK,       { .vop_readlink = nfs_readlink },
  215         VOPNAME_FSYNC,          { .vop_fsync = nfs_fsync },
  216         VOPNAME_INACTIVE,       { .vop_inactive = nfs_inactive },
  217         VOPNAME_FID,            { .vop_fid = nfs_fid },
  218         VOPNAME_RWLOCK,         { .vop_rwlock = nfs_rwlock },
  219         VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs_rwunlock },
  220         VOPNAME_SEEK,           { .vop_seek = nfs_seek },
  221         VOPNAME_FRLOCK,         { .vop_frlock = nfs_frlock },
  222         VOPNAME_SPACE,          { .vop_space = nfs_space },
  223         VOPNAME_REALVP,         { .vop_realvp = nfs_realvp },
  224         VOPNAME_GETPAGE,        { .vop_getpage = nfs_getpage },
  225         VOPNAME_PUTPAGE,        { .vop_putpage = nfs_putpage },
  226         VOPNAME_MAP,            { .vop_map = nfs_map },
  227         VOPNAME_ADDMAP,         { .vop_addmap = nfs_addmap },
  228         VOPNAME_DELMAP,         { .vop_delmap = nfs_delmap },
  229         VOPNAME_DUMP,           { .vop_dump = nfs_dump },
  230         VOPNAME_PATHCONF,       { .vop_pathconf = nfs_pathconf },
  231         VOPNAME_PAGEIO,         { .vop_pageio = nfs_pageio },
  232         VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs_setsecattr },
  233         VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs_getsecattr },
  234         VOPNAME_SHRLOCK,        { .vop_shrlock = nfs_shrlock },
  235         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
  236         NULL,                   NULL
  237 };
 238 
  239 /*
       * Return the global nfs_vnodeops pointer.
       *
  240  * XXX:  This is referenced in modstubs.s
  241  */
  242 struct vnodeops *
  243 nfs_getvnodeops(void)
  244 {
  245         return (nfs_vnodeops);
  246 }
 247 
  248 /* ARGSUSED */
      /*
       * Open entry point (the .vop_open member of nfs_vnodeops_template).
       *
       * Fails with EIO when the caller's zone (nfs_zone()) is not the zone
       * recorded in the mount information (mi_zone).  Otherwise it stores the
       * caller's credential in the rnode if the rnode has none yet, and then
       * decides how the cached attributes and data are to be revalidated, as
       * described by the comment in the body.
       *
       * Returns 0, EIO, or whatever nfs_validate_caches() or
       * nfs_getattr_otw() returned.  flag and ct are not used.
       */
  249 static int
  250 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
  251 {
  252         int error;
  253         struct vattr va;
  254         rnode_t *rp;
  255         vnode_t *vp;
  256 
  257         vp = *vpp;
  258         rp = VTOR(vp);
  259         if (nfs_zone() != VTOMI(vp)->mi_zone)
  260                 return (EIO);
              /*
               * r_cred is examined and set under r_statelock.  A hold is
               * taken on cr before the rnode keeps a pointer to it.
               */
  261         mutex_enter(&rp->r_statelock);
  262         if (rp->r_cred == NULL) {
  263                 crhold(cr);
  264                 rp->r_cred = cr;
  265         }
  266         mutex_exit(&rp->r_statelock);
  267 
  268         /*
  269          * If there is no cached data or if close-to-open
  270          * consistency checking is turned off, we can avoid
  271          * the over the wire getattr.  Otherwise, if the
  272          * file system is mounted readonly, then just verify
  273          * the caches are up to date using the normal mechanism.
  274          * Else, if the file is not mmap'd, then just mark
  275          * the attributes as timed out.  They will be refreshed
  276          * and the caches validated prior to being used.
  277          * Else, the file system is mounted writeable so
  278          * force an over the wire GETATTR in order to ensure
  279          * that all cached data is valid.
               *
               * Note that a vnode with more than one reference
               * (v_count > 1) always takes the revalidation path, even
               * when nothing is cached or MI_NOCTO is set.  Because the
               * "mark as timed out" case also requires v_count == 1, such
               * a vnode on a writeable mount always gets the over the
               * wire GETATTR.
  280          */
  281         if (vp->v_count > 1 ||
  282             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
  283             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
  284                 if (vn_is_readonly(vp))
  285                         error = nfs_validate_caches(vp, cr);
  286                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
  287                         PURGE_ATTRCACHE(vp);
  288                         error = 0;
  289                 } else {
  290                         va.va_mask = AT_ALL;
  291                         error = nfs_getattr_otw(vp, &va, cr);
  292                 }
  293         } else
  294                 error = 0;
  295 
  296         return (error);
  297 }
 298 
  299 /* ARGSUSED */
      /*
       * Close entry point (the .vop_close member of nfs_vnodeops_template).
       *
       * Every close made from the right zone releases the calling process's
       * locks on the file.  The remaining work -- purging the DNLC for an
       * unlinked file, flushing cached pages, collecting the error saved in
       * the rnode and refreshing the attributes -- is done only when
       * count <= 1.
       *
       * Returns EIO when called from the wrong zone, 0 when count > 1, and
       * otherwise either the nfs_putpage() error or the value that was saved
       * in r_error (which is then cleared).  The result of the final
       * nfs_getattr_otw() call is ignored.
       */
  300 static int
  301 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
  302         caller_context_t *ct)
  303 {
  304         rnode_t *rp;
  305         int error;
  306         struct vattr va;
  307 
  308         /*
  309          * zone_enter(2) prevents processes from changing zones with NFS files
  310          * open; if we happen to get here from the wrong zone we can't do
  311          * anything over the wire.
  312          */
  313         if (VTOMI(vp)->mi_zone != nfs_zone()) {
  314                 /*
  315                  * We could attempt to clean up locks, except we're sure
  316                  * that the current process didn't acquire any locks on
  317                  * the file: any attempt to lock a file belonging to another
  318                  * zone will fail, and one can't lock an NFS file and then
  319                  * change zones, as that fails too.
  320                  *
  321                  * Returning an error here is the sane thing to do.  A
  322                  * subsequent call to VN_RELE() which translates to a
  323                  * nfs_inactive() will clean up state: if the zone of the
  324                  * vnode's origin is still alive and kicking, an async worker
  325                  * thread will handle the request (from the correct zone), and
  326                  * everything (minus the final nfs_getattr_otw() call) should
  327                  * be OK. If the zone is going away nfs_async_inactive() will
  328                  * throw away cached pages inline.
  329                  */
  330                 return (EIO);
  331         }
  332 
  333         /*
  334          * If we are using local locking for this filesystem, then
  335          * release all of the SYSV style record locks.  Otherwise,
  336          * we are doing network locking and we need to release all
  337          * of the network locks.  All of the locks held by this
  338          * process on this file are released no matter what the
  339          * incoming reference count is.
  340          */
  341         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
  342                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
  343                 cleanshares(vp, ttoproc(curthread)->p_pid);
  344         } else
  345                 nfs_lockrelease(vp, flag, offset, cr);
  346 
              /*
               * Everything below is skipped while count > 1.
               * NOTE(review): count is presumably the number of references
               * still held on the open file, which would make count <= 1
               * the last close -- confirm against the VOP_CLOSE caller.
               */
  347         if (count > 1)
  348                 return (0);
  349 
  350         /*
  351          * If the file has been `unlinked', then purge the
  352          * DNLC so that this vnode will get recycled quicker
  353          * and the .nfs* file on the server will get removed.
  354          */
  355         rp = VTOR(vp);
  356         if (rp->r_unldvp != NULL)
  357                 dnlc_purge_vp(vp);
  358 
  359         /*
  360          * If the file was open for write and there are pages,
  361          * then if the file system was mounted using the "no-close-
  362          *      to-open" semantics, then start an asynchronous flush
  363          *      of all of the pages in the file.
  364          * else the file system was not mounted using the "no-close-
  365          *      to-open" semantics, then do a synchronous flush and
  366          *      commit of all of the dirty and uncommitted pages.
  367          *
  368          * The asynchronous flush of the pages in the "nocto" path
  369          * mostly just associates a cred pointer with the rnode so
  370          * writes which happen later will have a better chance of
  371          * working.  It also starts the data being written to the
  372          * server, but without unnecessarily delaying the application.
               *
               * EAGAIN from the asynchronous flush is not treated as an
               * error.  The error saved in the rnode (r_error) is picked
               * up and cleared only when the flush itself reported none;
               * a flush error is returned as is and leaves r_error alone.
               * When no flush is needed, r_error is always picked up and
               * cleared.
  373          */
  374         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
  375                 if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
  376                         error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
  377                             cr, ct);
  378                         if (error == EAGAIN)
  379                                 error = 0;
  380                 } else
  381                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
  382                 if (!error) {
  383                         mutex_enter(&rp->r_statelock);
  384                         error = rp->r_error;
  385                         rp->r_error = 0;
  386                         mutex_exit(&rp->r_statelock);
  387                 }
  388         } else {
  389                 mutex_enter(&rp->r_statelock);
  390                 error = rp->r_error;
  391                 rp->r_error = 0;
  392                 mutex_exit(&rp->r_statelock);
  393         }
  394 
  395         /*
  396          * If RWRITEATTR is set, then issue an over the wire GETATTR to
  397          * refresh the attribute cache with a set of attributes which
  398          * weren't returned from a WRITE.  This will enable the close-
  399          * to-open processing to work.
               *
               * NOTE(review): va is handed to nfs_getattr_otw() without
               * va_mask having been set, whereas nfs_open() sets
               * va_mask = AT_ALL before making the same call.  Presumably
               * nfs_getattr_otw() fills in va itself -- confirm.
  400          */
  401         if (rp->r_flags & RWRITEATTR)
  402                 (void) nfs_getattr_otw(vp, &va, cr);
  403 
  404         return (error);
  405 }
 406 
  407 /* ARGSUSED */
      /*
       * Read entry point (the .vop_read member of nfs_vnodeops_template).
       *
       * The caller is expected to hold r_rwlock as a reader (asserted below).
       * Only regular files can be read, and the whole request has to lie
       * within 0 .. MAXOFF32_T.
       *
       * There are two ways the data is obtained:
       *  - directly from the server through nfsread() into a temporary kernel
       *    buffer, when caching is disabled or direct I/O applies;
       *  - otherwise through the page cache, one MAXBSIZE-aligned window at a
       *    time, using vpm when vpm_enable is set and segmap when it is not.
       *
       * Returns 0 on success, which includes a zero-length request and a read
       * that starts at or beyond the cached file size.  Error returns:
       *      EIO     called from a zone other than the mount's zone
       *      EISDIR  vp is not a regular file
       *      EFBIG   starting offset is beyond MAXOFF32_T
       *      EINVAL  starting offset is negative, or the request would end
       *              beyond MAXOFF32_T
       *      EINTR   a signal arrived while waiting for RINCACHEPURGE to clear
       * plus anything returned by nfsread(), nfs_validate_caches(), uiomove(),
       * vpm_data_copy(), vpm_sync_pages() or segmap_release().
       * ioflag and ct are not used.
       */
  408 static int
  409 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
  410         caller_context_t *ct)
  411 {
  412         rnode_t *rp;
  413         u_offset_t off;
  414         offset_t diff;
  415         int on;
  416         size_t n;
  417         caddr_t base;
  418         uint_t flags;
  419         int error;
  420         mntinfo_t *mi;
  421 
  422         rp = VTOR(vp);
  423         mi = VTOMI(vp);
  424 
  425         if (nfs_zone() != mi->mi_zone)
  426                 return (EIO);
  427 
  428         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
  429 
  430         if (vp->v_type != VREG)
  431                 return (EISDIR);
  432 
  433         if (uiop->uio_resid == 0)
  434                 return (0);
  435 
  436         if (uiop->uio_loffset > MAXOFF32_T)
  437                 return (EFBIG);
  438 
  439         if (uiop->uio_loffset < 0 ||
  440             uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
  441                 return (EINVAL);
  442 
  443         /*
  444          * Bypass VM if caching has been disabled (e.g., locking) or if
  445          * using client-side direct I/O and the file is not mmap'd and
  446          * there are no cached pages.
  447          */
  448         if ((vp->v_flag & VNOCACHE) ||
  449             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
  450             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
  451             !vn_has_cached_data(vp))) {
  452                 size_t bufsize;
  453                 size_t resid = 0;
  454 
  455                 /*
  456                  * Let's try to do read in as large a chunk as we can
  457                  * (Filesystem (NFS client) bsize if possible/needed).
  458                  * For V3, this is 32K and for V2, this is 8K.
  459                  */
  460                 bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
  461                 base = kmem_alloc(bufsize, KM_SLEEP);
                      /*
                       * Read up to bufsize bytes at a time into the temporary
                       * buffer and copy what was read out to the caller.
                       * NOTE(review): resid is presumably the part of the
                       * request that nfsread() did not transfer -- confirm.
                       * The loop stops on an error, once the request has been
                       * satisfied, or when a pass moved no data (n == 0).
                       */
  462                 do {
  463                         n = MIN(uiop->uio_resid, bufsize);
  464                         error = nfsread(vp, base, uiop->uio_offset, n,
  465                             &resid, cr);
  466                         if (!error) {
  467                                 n -= resid;
  468                                 error = uiomove(base, n, UIO_READ, uiop);
  469                         }
  470                 } while (!error && uiop->uio_resid > 0 && n > 0);
  471                 kmem_free(base, bufsize);
  472                 return (error);
  473         }
  474 
  475         error = 0;
  476 
              /*
               * Cached read.  Each pass handles the part of the request that
               * falls inside one MAXBSIZE-aligned window of the file:
               * off is the start of the window, on the offset within it,
               * and n the number of bytes wanted from it.
               */
  477         do {
  478                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
  479                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
  480                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
  481 
  482                 error = nfs_validate_caches(vp, cr);
  483                 if (error)
  484                         break;
  485 
                      /*
                       * Wait for as long as RINCACHEPURGE is set.  A zero
                       * return from cv_wait_sig() means the wait was
                       * interrupted by a signal, which ends the read with
                       * EINTR.
                       */
  486                 mutex_enter(&rp->r_statelock);
  487                 while (rp->r_flags & RINCACHEPURGE) {
  488                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
  489                                 mutex_exit(&rp->r_statelock);
  490                                 return (EINTR);
  491                         }
  492                 }
                      /*
                       * diff is the distance from the current offset to the
                       * cached file size.  Stop at or beyond that size, and
                       * trim n so that the read does not go past it.
                       */
  493                 diff = rp->r_size - uiop->uio_loffset;
  494                 mutex_exit(&rp->r_statelock);
  495                 if (diff <= 0)
  496                         break;
  497                 if (diff < n)
  498                         n = (size_t)diff;
  499 
  500                 if (vpm_enable) {
  501                         /*
  502                          * Copy data.
  503                          */
  504                         error = vpm_data_copy(vp, off + on, n, uiop,
  505                             1, NULL, 0, S_READ);
  506                 } else {
  507                         base = segmap_getmapflt(segkmap, vp, off + on, n,
  508                             1, S_READ);
  509                         error = uiomove(base + on, n, UIO_READ, uiop);
  510                 }
  511 
                      /*
                       * Release what was set up above in either case.  After
                       * a successful copy the result of the release becomes
                       * the error for this pass; after a failed copy the
                       * release is done with no flags and its result is
                       * ignored so that the copy error is the one returned.
                       */
  512                 if (!error) {
  513                         /*
  514                          * If read a whole block or read to eof,
  515                          * won't need this buffer again soon.
  516                          */
  517                         mutex_enter(&rp->r_statelock);
  518                         if (n + on == MAXBSIZE ||
  519                             uiop->uio_loffset == rp->r_size)
  520                                 flags = SM_DONTNEED;
  521                         else
  522                                 flags = 0;
  523                         mutex_exit(&rp->r_statelock);
  524                         if (vpm_enable) {
  525                                 error = vpm_sync_pages(vp, off, n, flags);
  526                         } else {
  527                                 error = segmap_release(segkmap, base, flags);
  528                         }
  529                 } else {
  530                         if (vpm_enable) {
  531                                 (void) vpm_sync_pages(vp, off, n, 0);
  532                         } else {
  533                                 (void) segmap_release(segkmap, base, 0);
  534                         }
  535                 }
  536         } while (!error && uiop->uio_resid > 0);
  537 
  538         return (error);
  539 }
 540 
 541 /* ARGSUSED */
 542 static int
 543 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 544         caller_context_t *ct)
 545 {
 546         rnode_t *rp;
 547         u_offset_t off;
 548         caddr_t base;
 549         uint_t flags;
 550         int remainder;
 551         size_t n;
 552         int on;
 553         int error;
 554         int resid;
 555         offset_t offset;
 556         rlim_t limit;
 557         mntinfo_t *mi;
 558 
 559         rp = VTOR(vp);
 560 
 561         mi = VTOMI(vp);
 562         if (nfs_zone() != mi->mi_zone)
 563                 return (EIO);
 564         if (vp->v_type != VREG)
 565                 return (EISDIR);
 566 
 567         if (uiop->uio_resid == 0)
 568                 return (0);
 569 
 570         if (ioflag & FAPPEND) {
 571                 struct vattr va;
 572 
 573                 /*
 574                  * Must serialize if appending.
 575                  */
 576                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 577                         nfs_rw_exit(&rp->r_rwlock);
 578                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 579                             INTR(vp)))
 580                                 return (EINTR);
 581                 }
 582 
 583                 va.va_mask = AT_SIZE;
 584                 error = nfsgetattr(vp, &va, cr);
 585                 if (error)
 586                         return (error);
 587                 uiop->uio_loffset = va.va_size;
 588         }
 589 
 590         if (uiop->uio_loffset > MAXOFF32_T)
 591                 return (EFBIG);
 592 
 593         offset = uiop->uio_loffset + uiop->uio_resid;
 594 
 595         if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
 596                 return (EINVAL);
 597 
 598         if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
 599                 limit = MAXOFF32_T;
 600         } else {
 601                 limit = (rlim_t)uiop->uio_llimit;
 602         }
 603 
 604         /*
 605          * Check to make sure that the process will not exceed
 606          * its limit on file size.  It is okay to write up to
 607          * the limit, but not beyond.  Thus, the write which
 608          * reaches the limit will be short and the next write
 609          * will return an error.
 610          */
 611         remainder = 0;
 612         if (offset > limit) {
 613                 remainder = offset - limit;
 614                 uiop->uio_resid = limit - uiop->uio_offset;
 615                 if (uiop->uio_resid <= 0) {
 616                         proc_t *p = ttoproc(curthread);
 617 
 618                         uiop->uio_resid += remainder;
 619                         mutex_enter(&p->p_lock);
 620                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 621                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 622                         mutex_exit(&p->p_lock);
 623                         return (EFBIG);
 624                 }
 625         }
 626 
 627         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 628                 return (EINTR);
 629 
 630         /*
 631          * Bypass VM if caching has been disabled (e.g., locking) or if
 632          * using client-side direct I/O and the file is not mmap'd and
 633          * there are no cached pages.
 634          */
 635         if ((vp->v_flag & VNOCACHE) ||
 636             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 637             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 638             !vn_has_cached_data(vp))) {
 639                 size_t bufsize;
 640                 int count;
 641                 uint_t org_offset;
 642 
 643 nfs_fwrite:
 644                 if (rp->r_flags & RSTALE) {
 645                         resid = uiop->uio_resid;
 646                         offset = uiop->uio_loffset;
 647                         error = rp->r_error;
 648                         /*
 649                          * A close may have cleared r_error, if so,
 650                          * propagate ESTALE error return properly
 651                          */
 652                         if (error == 0)
 653                                 error = ESTALE;
 654                         goto bottom;
 655                 }
 656                 bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
 657                 base = kmem_alloc(bufsize, KM_SLEEP);
 658                 do {
 659                         resid = uiop->uio_resid;
 660                         offset = uiop->uio_loffset;
 661                         count = MIN(uiop->uio_resid, bufsize);
 662                         org_offset = uiop->uio_offset;
 663                         error = uiomove(base, count, UIO_WRITE, uiop);
 664                         if (!error) {
 665                                 error = nfswrite(vp, base, org_offset,
 666                                     count, cr);
 667                         }
 668                 } while (!error && uiop->uio_resid > 0);
 669                 kmem_free(base, bufsize);
 670                 goto bottom;
 671         }
 672 
 673         do {
 674                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 675                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 676                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 677 
 678                 resid = uiop->uio_resid;
 679                 offset = uiop->uio_loffset;
 680 
 681                 if (rp->r_flags & RSTALE) {
 682                         error = rp->r_error;
 683                         /*
 684                          * A close may have cleared r_error, if so,
 685                          * propagate ESTALE error return properly
 686                          */
 687                         if (error == 0)
 688                                 error = ESTALE;
 689                         break;
 690                 }
 691 
 692                 /*
 693                  * Don't create dirty pages faster than they
 694                  * can be cleaned so that the system doesn't
 695                  * get imbalanced.  If the async queue is
 696                  * maxed out, then wait for it to drain before
 697                  * creating more dirty pages.  Also, wait for
 698                  * any threads doing pagewalks in the vop_getattr
 699                  * entry points so that they don't block for
 700                  * long periods.
 701                  */
 702                 mutex_enter(&rp->r_statelock);
 703                 while ((mi->mi_max_threads != 0 &&
 704                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 705                     rp->r_gcount > 0) {
 706                         if (INTR(vp)) {
 707                                 klwp_t *lwp = ttolwp(curthread);
 708 
 709                                 if (lwp != NULL)
 710                                         lwp->lwp_nostop++;
 711                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 712                                         mutex_exit(&rp->r_statelock);
 713                                         if (lwp != NULL)
 714                                                 lwp->lwp_nostop--;
 715                                         error = EINTR;
 716                                         goto bottom;
 717                                 }
 718                                 if (lwp != NULL)
 719                                         lwp->lwp_nostop--;
 720                         } else
 721                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 722                 }
 723                 mutex_exit(&rp->r_statelock);
 724 
 725                 /*
 726                  * Touch the page and fault it in if it is not in core
 727                  * before segmap_getmapflt or vpm_data_copy can lock it.
 728                  * This is to avoid the deadlock if the buffer is mapped
 729                  * to the same file through mmap which we want to write.
 730                  */
 731                 uio_prefaultpages((long)n, uiop);
 732 
 733                 if (vpm_enable) {
 734                         /*
 735                          * It will use kpm mappings, so no need to
 736                          * pass an address.
 737                          */
 738                         error = writerp(rp, NULL, n, uiop, 0);
 739                 } else  {
 740                         if (segmap_kpm) {
 741                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 742                                 size_t pn = MIN(PAGESIZE - pon,
 743                                     uiop->uio_resid);
 744                                 int pagecreate;
 745 
 746                                 mutex_enter(&rp->r_statelock);
 747                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 748                                     uiop->uio_loffset + pn >= rp->r_size);
 749                                 mutex_exit(&rp->r_statelock);
 750 
 751                                 base = segmap_getmapflt(segkmap, vp, off + on,
 752                                     pn, !pagecreate, S_WRITE);
 753 
 754                                 error = writerp(rp, base + pon, n, uiop,
 755                                     pagecreate);
 756 
 757                         } else {
 758                                 base = segmap_getmapflt(segkmap, vp, off + on,
 759                                     n, 0, S_READ);
 760                                 error = writerp(rp, base + on, n, uiop, 0);
 761                         }
 762                 }
 763 
 764                 if (!error) {
 765                         if (mi->mi_flags & MI_NOAC)
 766                                 flags = SM_WRITE;
 767                         else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
 768                                 /*
 769                                  * Have written a whole block.
 770                                  * Start an asynchronous write
 771                                  * and mark the buffer to
 772                                  * indicate that it won't be
 773                                  * needed again soon.
 774                                  */
 775                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 776                         } else
 777                                 flags = 0;
 778                         if ((ioflag & (FSYNC|FDSYNC)) ||
 779                             (rp->r_flags & ROUTOFSPACE)) {
 780                                 flags &= ~SM_ASYNC;
 781                                 flags |= SM_WRITE;
 782                         }
 783                         if (vpm_enable) {
 784                                 error = vpm_sync_pages(vp, off, n, flags);
 785                         } else {
 786                                 error = segmap_release(segkmap, base, flags);
 787                         }
 788                 } else {
 789                         if (vpm_enable) {
 790                                 (void) vpm_sync_pages(vp, off, n, 0);
 791                         } else {
 792                                 (void) segmap_release(segkmap, base, 0);
 793                         }
 794                         /*
 795                          * In the event that we got an access error while
 796                          * faulting in a page for a write-only file just
 797                          * force a write.
 798                          */
 799                         if (error == EACCES)
 800                                 goto nfs_fwrite;
 801                 }
 802         } while (!error && uiop->uio_resid > 0);
 803 
 804 bottom:
 805         if (error) {
 806                 uiop->uio_resid = resid + remainder;
 807                 uiop->uio_loffset = offset;
 808         } else
 809                 uiop->uio_resid += remainder;
 810 
 811         nfs_rw_exit(&rp->r_lkserlock);
 812 
 813         return (error);
 814 }
 815 
 816 /*
      * Perform page I/O for vp through nfs_bio().  A buf is set up for the
      * page list pp with pageio_setup(), mapped with bp_mapin(), handed to
      * nfs_bio() together with cr, and then unmapped and released again.
      * The return value is whatever nfs_bio() returned.
      *
 817  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 818  */
 819 static int
 820 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 821         int flags, cred_t *cr)
 822 {
 823         struct buf *bp;
 824         int error;
 825 
 826         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 827         bp = pageio_setup(pp, len, vp, flags);
 828         ASSERT(bp != NULL);
 829 
 830         /*
 831          * pageio_setup should have set b_addr to 0.  This
 832          * is correct since we want to do I/O on a page
 833          * boundary.  bp_mapin will use this addr to calculate
 834          * an offset, and then set b_addr to the kernel virtual
 835          * address it allocated for us.
 836          */
 837         ASSERT(bp->b_un.b_addr == 0);
 838 
             /*
              * Describe the transfer: both device fields are zeroed, the
              * vnode is recorded as the file, and off supplies both the
              * block number (via lbtodb) and the byte offset.
              */
 839         bp->b_edev = 0;
 840         bp->b_dev = 0;
 841         bp->b_lblkno = lbtodb(off);
 842         bp->b_file = vp;
 843         bp->b_offset = (offset_t)off;
 844         bp_mapin(bp);
 845 
 846         error = nfs_bio(bp, cr);
 847 
             /* Undo bp_mapin() and then pageio_setup(), in that order. */
 848         bp_mapout(bp);
 849         pageio_done(bp);
 850 
 851         return (error);
 852 }
 853 
 854 /*
 855  * Write to file.  Writes to remote server in largest size
 856  * chunks that the server can handle.  Write is synchronous.
      *
      * The count bytes at base are sent as a sequence of RFS_WRITE calls,
      * each carrying at most mi_curwrite bytes, beginning at file offset
      * offset.  The loop stops at the first failure.  Returns 0 on
      * success, otherwise the error from rfs2call() or the errno that
      * geterrno() derives from the status in the reply.
 857  */
 858 static int
 859 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
 860 {
 861         rnode_t *rp;
 862         mntinfo_t *mi;
 863         struct nfswriteargs wa;
 864         struct nfsattrstat ns;
 865         int error;
 866         int tsize;
 867         int douprintf;
 868 
 869         douprintf = 1;
 870 
 871         rp = VTOR(vp);
 872         mi = VTOMI(vp);
 873 
 874         ASSERT(nfs_zone() == mi->mi_zone);
 875 
 876         wa.wa_args = &wa.wa_args_buf;
 877         wa.wa_fhandle = *VTOFH(vp);
 878 
 879         do {
                     /* Size of this transfer: never more than mi_curwrite. */
 880                 tsize = MIN(mi->mi_curwrite, count);
 881                 wa.wa_data = base;
 882                 wa.wa_begoff = offset;
 883                 wa.wa_totcount = tsize;
 884                 wa.wa_count = tsize;
 885                 wa.wa_offset = offset;
 886 
                     /* Enter the mount's I/O kstat run queue, if kstats exist. */
 887                 if (mi->mi_io_kstats) {
 888                         mutex_enter(&mi->mi_lock);
 889                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 890                         mutex_exit(&mi->mi_lock);
 891                 }
 892                 wa.wa_mblk = NULL;
                     /* Reissue the call for as long as ENFS_TRYAGAIN comes back. */
 893                 do {
 894                         error = rfs2call(mi, RFS_WRITE,
 895                             xdr_writeargs, (caddr_t)&wa,
 896                             xdr_attrstat, (caddr_t)&ns, cr,
 897                             &douprintf, &ns.ns_status, 0, NULL);
 898                 } while (error == ENFS_TRYAGAIN);
 899                 if (mi->mi_io_kstats) {
 900                         mutex_enter(&mi->mi_lock);
 901                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 902                         mutex_exit(&mi->mi_lock);
 903                 }
 904 
 905                 if (!error) {
 906                         error = geterrno(ns.ns_status);
 907                         /*
 908                          * Can't check for stale fhandle and purge caches
 909                          * here because pages are held by nfs_getpage.
 910                          * Just mark the attribute cache as timed out
 911                          * and set RWRITEATTR to indicate that the file
 912                          * was modified with a WRITE operation.
 913                          */
 914                         if (!error) {
                                     /* Advance past the chunk just written. */
 915                                 count -= tsize;
 916                                 base += tsize;
 917                                 offset += tsize;
 918                                 if (mi->mi_io_kstats) {
 919                                         mutex_enter(&mi->mi_lock);
 920                                         KSTAT_IO_PTR(mi->mi_io_kstats)->
 921                                             writes++;
 922                                         KSTAT_IO_PTR(mi->mi_io_kstats)->
 923                                             nwritten += tsize;
 924                                         mutex_exit(&mi->mi_lock);
 925                                 }
 926                                 lwp_stat_update(LWP_STAT_OUBLK, 1);
 927                                 mutex_enter(&rp->r_statelock);
 928                                 PURGE_ATTRCACHE_LOCKED(rp);
 929                                 rp->r_flags |= RWRITEATTR;
 930                                 mutex_exit(&rp->r_statelock);
 931                         }
 932                 }
 933         } while (!error && count);
 934 
 935         return (error);
 936 }
 937 
 938 /*
 939  * Read from a file.  Reads data in largest chunks our interface can handle.
      *
      * Issues RFS_READ calls of at most mi_curread bytes each into base,
      * starting at file offset offset, until count bytes have been read,
      * an error occurs, or a reply carries fewer bytes than were asked
      * for.  The number of bytes not read is stored in *residp.  After a
      * successful read the attributes from the last reply are either
      * cached or used to mark the attribute cache as out of date.
      * Returns 0 or an errno; note that a failure of nattr_to_vattr() is
      * returned to the caller as well.
 940  */
 941 static int
 942 nfsread(vnode_t *vp, caddr_t base, uint_t offset,
 943     int count, size_t *residp, cred_t *cr)
 944 {
 945         mntinfo_t *mi;
 946         struct nfsreadargs ra;
 947         struct nfsrdresult rr;
 948         int tsize;
 949         int error;
 950         int douprintf;
 951         failinfo_t fi;
 952         rnode_t *rp;
 953         struct vattr va;
 954         hrtime_t t;
 955 
 956         rp = VTOR(vp);
 957         mi = VTOMI(vp);
 958 
 959         ASSERT(nfs_zone() == mi->mi_zone);
 960 
 961         douprintf = 1;
 962 
 963         ra.ra_fhandle = *VTOFH(vp);
 964 
             /* Information handed to rfs2call() through its last argument. */
 965         fi.vp = vp;
 966         fi.fhp = (caddr_t)&ra.ra_fhandle;
 967         fi.copyproc = nfscopyfh;
 968         fi.lookupproc = nfslookup;
 969         fi.xattrdirproc = acl_getxattrdir2;
 970 
 971         do {
 972                 if (mi->mi_io_kstats) {
 973                         mutex_enter(&mi->mi_lock);
 974                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 975                         mutex_exit(&mi->mi_lock);
 976                 }
 977 
                     /*
                      * The arguments are rebuilt on every pass, so a retry
                      * after ENFS_TRYAGAIN picks up the current mi_curread.
                      */
 978                 do {
 979                         tsize = MIN(mi->mi_curread, count);
 980                         rr.rr_data = base;
 981                         ra.ra_offset = offset;
 982                         ra.ra_totcount = tsize;
 983                         ra.ra_count = tsize;
 984                         ra.ra_data = base;
                             /* Time just before the call; compared with r_mtime below. */
 985                         t = gethrtime();
 986                         error = rfs2call(mi, RFS_READ,
 987                             xdr_readargs, (caddr_t)&ra,
 988                             xdr_rdresult, (caddr_t)&rr, cr,
 989                             &douprintf, &rr.rr_status, 0, &fi);
 990                 } while (error == ENFS_TRYAGAIN);
 991 
 992                 if (mi->mi_io_kstats) {
 993                         mutex_enter(&mi->mi_lock);
 994                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 995                         mutex_exit(&mi->mi_lock);
 996                 }
 997 
 998                 if (!error) {
 999                         error = geterrno(rr.rr_status);
1000                         if (!error) {
                                     /* Advance by what the reply actually carried. */
1001                                 count -= rr.rr_count;
1002                                 base += rr.rr_count;
1003                                 offset += rr.rr_count;
1004                                 if (mi->mi_io_kstats) {
1005                                         mutex_enter(&mi->mi_lock);
1006                                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1007                                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
1008                                             rr.rr_count;
1009                                         mutex_exit(&mi->mi_lock);
1010                                 }
1011                                 lwp_stat_update(LWP_STAT_INBLK, 1);
1012                         }
1013                 }
             /* A reply shorter than tsize ends the loop even if count remains. */
1014         } while (!error && count && rr.rr_count == tsize);
1015 
1016         *residp = count;
1017 
1018         if (!error) {
1019                 /*
1020                  * Since no error occurred, we have the current
1021                  * attributes and we need to do a cache check and then
1022                  * potentially update the cached attributes.  We can't
1023                  * use the normal attribute check and cache mechanisms
1024                  * because they might cause a cache flush which would
1025                  * deadlock.  Instead, we just check the cache to see
1026                  * if the attributes have changed.  If it is, then we
1027                  * just mark the attributes as out of date.  The next
1028                  * time that the attributes are checked, they will be
1029                  * out of date, new attributes will be fetched, and
1030                  * the page cache will be flushed.  If the attributes
1031                  * weren't changed, then we just update the cached
1032                  * attributes with these attributes.
1033                  */
1034                 /*
1035                  * If NFS_ACL is supported on the server, then the
1036                  * attributes returned by server may have minimal
1037                  * permissions sometimes denying access to users having
1038                  * proper access.  To get the proper attributes, mark
1039                  * the attributes as expired so that they will be
1040                  * regotten via the NFS_ACL GETATTR2 procedure.
1041                  */
1042                 error = nattr_to_vattr(vp, &rr.rr_attr, &va);
1043                 mutex_enter(&rp->r_statelock);
1044                 if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
1045                     (mi->mi_flags & MI_ACL)) {
                             /* r_statelock is dropped before PURGE_ATTRCACHE(). */
1046                         mutex_exit(&rp->r_statelock);
1047                         PURGE_ATTRCACHE(vp);
1048                 } else {
                             /*
                              * Cache the new attributes only when r_mtime is
                              * not later than the time the read was issued.
                              */
1049                         if (rp->r_mtime <= t) {
1050                                 nfs_attrcache_va(vp, &va);
1051                         }
1052                         mutex_exit(&rp->r_statelock);
1053                 }
1054         }
1055 
1056         return (error);
1057 }
1058 
     /*
      * ioctl entry point.  The only command acted on is _FIODIRECTIO, which
      * is passed to nfs_directio() with arg narrowed to an int; every other
      * command yields ENOTTY.  A caller whose zone is not the mount's zone
      * (mi_zone) gets EIO.
      */
1059 /* ARGSUSED */
1060 static int
1061 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1062         caller_context_t *ct)
1063 {
1064         /* Refuse callers running outside the mount's zone. */
1065         if (nfs_zone() != VTOMI(vp)->mi_zone)
1066                 return (EIO);
1067 
1068         /* Anything other than _FIODIRECTIO is not handled here. */
1069         if (cmd != _FIODIRECTIO)
1070                 return (ENOTTY);
1071 
1072         return (nfs_directio(vp, (int)arg, cr));
1073 }
1074 
     /*
      * Return the attributes selected by vap->va_mask for vp.
      *
      * With ATTR_HINT set and nothing but size, fsid and rdev requested,
      * the requested values are copied from the rnode under r_statelock
      * and no further checking is done.  Otherwise, if the mtime is wanted
      * and the file has cached pages that are dirty or have asynchronous
      * writes outstanding, those pages are pushed with nfs_putpage()
      * first; the attributes are then obtained through nfsgetattr().
      *
      * Returns EIO when called from a zone other than the mount's,
      * otherwise 0 (hint path) or the result of nfsgetattr().
      */
1075 /* ARGSUSED */
1076 static int
1077 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1078         caller_context_t *ct)
1079 {
1080         int error;
1081         rnode_t *rp;
1082 
1083         if (nfs_zone() != VTOMI(vp)->mi_zone)
1084                 return (EIO);
1085         /*
1086          * If it has been specified that the return value will
1087          * just be used as a hint, and we are only being asked
1088          * for size, fsid or rdevid, then return the client's
1089          * notion of these values without checking to make sure
1090          * that the attribute cache is up to date.
1091          * The whole point is to avoid an over the wire GETATTR
1092          * call.
1093          */
1094         rp = VTOR(vp);
1095         if (flags & ATTR_HINT) {
1096                 if (vap->va_mask ==
1097                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
                             /*
                              * Fill in only the fields that were asked for.
                              * These tests must use '&': OR-ing the mask
                              * with a non-zero AT_* bit is always true and
                              * would overwrite all three fields no matter
                              * what the caller requested.
                              */
1098                         mutex_enter(&rp->r_statelock);
1099                         if (vap->va_mask & AT_SIZE)
1100                                 vap->va_size = rp->r_size;
1101                         if (vap->va_mask & AT_FSID)
1102                                 vap->va_fsid = rp->r_attr.va_fsid;
1103                         if (vap->va_mask & AT_RDEV)
1104                                 vap->va_rdev = rp->r_attr.va_rdev;
1105                         mutex_exit(&rp->r_statelock);
1106                         return (0);
1107                 }
1108         }
1109 
1110         /*
1111          * Only need to flush pages if asking for the mtime
1112          * and if there any dirty pages or any outstanding
1113          * asynchronous (write) requests for this file.
1114          */
1115         if (vap->va_mask & AT_MTIME) {
1116                 if (vn_has_cached_data(vp) &&
1117                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
                             /*
                              * r_gcount is held non-zero for the duration of
                              * the flush; once it drops back to zero the
                              * threads sleeping on r_cv are woken.
                              */
1118                         mutex_enter(&rp->r_statelock);
1119                         rp->r_gcount++;
1120                         mutex_exit(&rp->r_statelock);
1121                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1122                         mutex_enter(&rp->r_statelock);
                             /* Record only the first out-of-space/quota error. */
1123                         if (error == ENOSPC || error == EDQUOT) {
1124                                 if (!rp->r_error)
1125                                         rp->r_error = error;
1126                         }
1127                         if (--rp->r_gcount == 0)
1128                                 cv_broadcast(&rp->r_cv);
1129                         mutex_exit(&rp->r_statelock);
1130                 }
1131         }
1132 
1133         return (nfsgetattr(vp, vap, cr));
1134 }
1135 
     /*
      * setattr entry point.  Validates the request, runs the policy check
      * against the file's current owner and mode, and on success passes the
      * request on to nfssetattr().
      *
      * Returns EINVAL if an AT_NOSET attribute is requested, EFBIG for a
      * regular-file size above MAXOFF32_T, EIO when called from a zone other
      * than the mount's, or the error from nfsgetattr(),
      * secpolicy_vnode_setattr() or nfssetattr().
      */
1136 /*ARGSUSED4*/
1137 static int
1138 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1139                 caller_context_t *ct)
1140 {
1141         struct vattr cur;
1142         uint_t want = vap->va_mask;
1143         int rc;
1144 
1145         /* Attributes covered by AT_NOSET may not be set at all. */
1146         if ((want & AT_NOSET) != 0)
1147                 return (EINVAL);
1148 
1149         /* A regular file may not be sized beyond MAXOFF32_T. */
1150         if ((want & AT_SIZE) != 0 && vap->va_type == VREG &&
1151             vap->va_size > MAXOFF32_T)
1152                 return (EFBIG);
1153 
1154         /* Refuse callers running outside the mount's zone. */
1155         if (nfs_zone() != VTOMI(vp)->mi_zone)
1156                 return (EIO);
1157 
1158         /* The policy check is given the current owner and mode. */
1159         cur.va_mask = AT_UID | AT_MODE;
1160         rc = nfsgetattr(vp, &cur, cr);
1161         if (rc != 0)
1162                 return (rc);
1163 
1164         rc = secpolicy_vnode_setattr(cr, vp, vap, &cur, flags,
1165             nfs_accessx, vp);
1166         if (rc != 0)
1167                 return (rc);
1168 
1169         /* Checks passed; hand the request to nfssetattr(). */
1170         return (nfssetattr(vp, vap, flags, cr));
1171 }
1172 
     /*
      * Send an RFS_SETATTR request for vp and bring the client's caches in
      * line with the outcome.  In order, this routine:
      *
      *  - flushes the file's pages if it may have dirty data;
      *  - converts vap into the over the wire arguments, with special
      *    handling when the times were not specified by the application;
      *  - remembers the current mode, then issues the call;
      *  - purges the access and ACL caches when uid, gid or mode were
      *    changed on an MI_ACL mount;
      *  - on success, invalidates cached pages beyond a new size, caches
      *    the returned attributes, and applies two work-arounds for
      *    server behavior (size not extended, mode bits cleared);
      *  - on failure, purges the attribute cache.
      *
      * Returns 0 or an errno.  No permission checking is done here.
      */
1173 static int
1174 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1175 {
1176         int error;
1177         uint_t mask;
1178         struct nfssaargs args;
1179         struct nfsattrstat ns;
1180         int douprintf;
1181         rnode_t *rp;
1182         struct vattr va;
1183         mode_t omode;
1184         mntinfo_t *mi;
1185         vsecattr_t *vsp;
1186         hrtime_t t;
1187 
1188         mask = vap->va_mask;
1189 
1190         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1191 
1192         rp = VTOR(vp);
1193 
1194         /*
1195          * Only need to flush pages if there are any pages and
1196          * if the file is marked as dirty in some fashion.  The
1197          * file must be flushed so that we can accurately
1198          * determine the size of the file and the cached data
1199          * after the SETATTR returns.  A file is considered to
1200          * be dirty if it is either marked with RDIRTY, has
1201          * outstanding i/o's active, or is mmap'd.  In this
1202          * last case, we can't tell whether there are dirty
1203          * pages, so we flush just to be sure.
1204          */
1205         if (vn_has_cached_data(vp) &&
1206             ((rp->r_flags & RDIRTY) ||
1207             rp->r_count > 0 ||
1208             rp->r_mapcnt > 0)) {
1209                 ASSERT(vp->v_type != VCHR);
1210                 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
                     /* Record only the first out-of-space/quota error. */
1211                 if (error && (error == ENOSPC || error == EDQUOT)) {
1212                         mutex_enter(&rp->r_statelock);
1213                         if (!rp->r_error)
1214                                 rp->r_error = error;
1215                         mutex_exit(&rp->r_statelock);
1216                 }
1217         }
1218 
1219         /*
1220          * If the system call was utime(2) or utimes(2) and the
1221          * application did not specify the times, then set the
1222          * mtime nanosecond field to 1 billion.  This will get
1223          * translated from 1 billion nanoseconds to 1 million
1224          * microseconds in the over the wire request.  The
1225          * server will use 1 million in the microsecond field
1226          * to tell whether both the mtime and atime should be
1227          * set to the server's current time.
1228          *
1229          * This is an overload of the protocol and should be
1230          * documented in the NFS Version 2 protocol specification.
1231          */
1232         if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1233                 vap->va_mtime.tv_nsec = 1000000000;
1234                 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1235                     NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1236                         error = vattr_to_sattr(vap, &args.saa_sa);
1237                 } else {
1238                         /*
1239                          * Use server times. vap time values will not be used.
1240                          * To ensure no time overflow, make sure vap has
1241                          * valid values, but retain the original values.
1242                          */
1243                         timestruc_t     mtime = vap->va_mtime;
1244                         timestruc_t     atime = vap->va_atime;
1245                         time_t          now;
1246 
1247                         now = gethrestime_sec();
1248                         if (NFS_TIME_T_OK(now)) {
1249                                 /* Just in case server does not know of this */
1250                                 vap->va_mtime.tv_sec = now;
1251                                 vap->va_atime.tv_sec = now;
1252                         } else {
1253                                 vap->va_mtime.tv_sec = 0;
1254                                 vap->va_atime.tv_sec = 0;
1255                         }
1256                         error = vattr_to_sattr(vap, &args.saa_sa);
1257                         /* set vap times back on */
1258                         vap->va_mtime = mtime;
1259                         vap->va_atime = atime;
1260                 }
1261         } else {
1262                 /* Either do not set times or use the client specified times */
1263                 error = vattr_to_sattr(vap, &args.saa_sa);
1264         }
1265         if (error) {
1266                 /* req time field(s) overflow - return immediately */
1267                 return (error);
1268         }
1269         args.saa_fh = *VTOFH(vp);
1270 
             /*
              * Remember the mode as it is before the call; it is compared
              * with the mode after a uid/gid change further down.
              */
1271         va.va_mask = AT_MODE;
1272         error = nfsgetattr(vp, &va, cr);
1273         if (error)
1274                 return (error);
1275         omode = va.va_mode;
1276 
1277         mi = VTOMI(vp);
1278 
1279         douprintf = 1;
1280 
             /* Time just before the call, passed to nfs_cache_fattr() below. */
1281         t = gethrtime();
1282 
1283         error = rfs2call(mi, RFS_SETATTR,
1284             xdr_saargs, (caddr_t)&args,
1285             xdr_attrstat, (caddr_t)&ns, cr,
1286             &douprintf, &ns.ns_status, 0, NULL);
1287 
1288         /*
1289          * Purge the access cache and ACL cache if changing either the
1290          * owner of the file, the group owner, or the mode.  These may
1291          * change the access permissions of the file, so purge old
1292          * information and start over again.
1293          */
1294         if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1295                 (void) nfs_access_purge_rp(rp);
1296                 if (rp->r_secattr != NULL) {
                             /*
                              * Detach the cached ACL under r_statelock and
                              * free it after the lock is dropped.  The
                              * pointer is re-tested because the first test
                              * was made without the lock.
                              */
1297                         mutex_enter(&rp->r_statelock);
1298                         vsp = rp->r_secattr;
1299                         rp->r_secattr = NULL;
1300                         mutex_exit(&rp->r_statelock);
1301                         if (vsp != NULL)
1302                                 nfs_acl_free(vsp);
1303                 }
1304         }
1305 
1306         if (!error) {
1307                 error = geterrno(ns.ns_status);
1308                 if (!error) {
1309                         /*
1310                          * If changing the size of the file, invalidate
1311                          * any local cached data which is no longer part
1312                          * of the file.  We also possibly invalidate the
1313                          * last page in the file.  We could use
1314                          * pvn_vpzero(), but this would mark the page as
1315                          * modified and require it to be written back to
1316                          * the server for no particularly good reason.
1317                          * This way, if we access it, then we bring it
1318                          * back in.  A read should be cheaper than a
1319                          * write.
1320                          */
1321                         if (mask & AT_SIZE) {
1322                                 nfs_invalidate_pages(vp,
1323                                     (vap->va_size & PAGEMASK), cr);
1324                         }
1325                         (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1326                         /*
1327                          * If NFS_ACL is supported on the server, then the
1328                          * attributes returned by server may have minimal
1329                          * permissions sometimes denying access to users having
1330                          * proper access.  To get the proper attributes, mark
1331                          * the attributes as expired so that they will be
1332                          * regotten via the NFS_ACL GETATTR2 procedure.
1333                          */
1334                         if (mi->mi_flags & MI_ACL) {
1335                                 PURGE_ATTRCACHE(vp);
1336                         }
1337                         /*
1338                          * This next check attempts to deal with NFS
1339                          * servers which can not handle increasing
1340                          * the size of the file via setattr.  Most
1341                          * of these servers do not return an error,
1342                          * but do not change the size of the file.
1343                          * Hence, this check and then attempt to set
1344                          * the file size by writing 1 byte at the
1345                          * offset of the end of the file that we need.
1346                          */
1347                         if ((mask & AT_SIZE) &&
1348                             ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1349                                 char zb = '\0';
1350 
                                     /* The result of this write becomes the return value. */
1351                                 error = nfswrite(vp, &zb,
1352                                     vap->va_size - sizeof (zb),
1353                                     sizeof (zb), cr);
1354                         }
1355                         /*
1356                          * Some servers will change the mode to clear the setuid
1357                          * and setgid bits when changing the uid or gid.  The
1358                          * client needs to compensate appropriately.
1359                          */
1360                         if (mask & (AT_UID | AT_GID)) {
1361                                 int terror;
1362 
1363                                 va.va_mask = AT_MODE;
1364                                 terror = nfsgetattr(vp, &va, cr);
                                     /*
                                      * The mode is put back when it differs from
                                      * the one requested, or, if none was
                                      * requested, from the mode saved in omode.
                                      * The recursive call asks for AT_MODE only,
                                      * so it cannot come back through this
                                      * branch.  Its result is ignored.
                                      */
1365                                 if (!terror &&
1366                                     (((mask & AT_MODE) &&
1367                                     va.va_mode != vap->va_mode) ||
1368                                     (!(mask & AT_MODE) &&
1369                                     va.va_mode != omode))) {
1370                                         va.va_mask = AT_MODE;
1371                                         if (mask & AT_MODE)
1372                                                 va.va_mode = vap->va_mode;
1373                                         else
1374                                                 va.va_mode = omode;
1375                                         (void) nfssetattr(vp, &va, 0, cr);
1376                                 }
1377                         }
1378                 } else {
                             /* The server returned an error status. */
1379                         PURGE_ATTRCACHE(vp);
1380                         PURGE_STALE_FH(error, vp, cr);
1381                 }
1382         } else {
                     /* rfs2call() itself failed. */
1383                 PURGE_ATTRCACHE(vp);
1384         }
1385 
1386         return (error);
1387 }
1388 
     /*
      * Adapter around nfs_access() that takes the vnode as a void *, calling
      * it with flags 0 and no caller context.  nfs_setattr() passes this
      * function to secpolicy_vnode_setattr(), presumably as its access-check
      * callback.  The zone is only ASSERTed here; nfs_access() checks it
      * again and returns EIO on a mismatch.
      */
1389 static int
1390 nfs_accessx(void *vp, int mode, cred_t *cr)
1391 {
1392         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1393         return (nfs_access(vp, mode, 0, cr, NULL));
1394 }
1395 
     /*
      * access entry point: decide whether cr may access vp in the way given
      * by mode (VREAD, VWRITE, VEXEC bits).
      *
      * On an MI_ACL mount the decision is delegated to acl_access2().
      * Otherwise it is made locally from the file's mode, uid and gid as
      * returned by nfsgetattr(), with the final verdict coming from
      * secpolicy_vnode_access2().
      *
      * Returns 0 if access is allowed, EIO when called from a zone other
      * than the mount's, EROFS for writes to a read-only file system (device
      * nodes excepted), EACCES for mandatory lock files, or another errno
      * from the routines called.
      */
1396 /* ARGSUSED */
1397 static int
1398 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1399 {
1400         struct vattr va;
1401         int error;
1402         mntinfo_t *mi;
1403         int shift = 0;
1404 
1405         mi = VTOMI(vp);
1406 
1407         if (nfs_zone() != mi->mi_zone)
1408                 return (EIO);
             /*
              * NOTE(review): MI_ACL is tested a second time after the call.
              * Presumably acl_access2() can clear the flag when the server
              * turns out not to support NFS_ACL, in which case its result is
              * discarded and the mode-bit check below is used - confirm.
              */
1409         if (mi->mi_flags & MI_ACL) {
1410                 error = acl_access2(vp, mode, flags, cr);
1411                 if (mi->mi_flags & MI_ACL)
1412                         return (error);
1413         }
1414 
1415         va.va_mask = AT_MODE | AT_UID | AT_GID;
1416         error = nfsgetattr(vp, &va, cr);
1417         if (error)
1418                 return (error);
1419 
1420         /*
1421          * Disallow write attempts on read-only
1422          * file systems, unless the file is a
1423          * device node.
1424          */
1425         if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1426                 return (EROFS);
1427 
1428         /*
1429          * Disallow attempts to access mandatory lock files.
1430          */
1431         if ((mode & (VWRITE | VREAD | VEXEC)) &&
1432             MANDLOCK(vp, va.va_mode))
1433                 return (EACCES);
1434 
1435         /*
1436          * Access check is based on only
1437          * one of owner, group, public.
1438          * If not owner, then check group.
1439          * If not a member of the group,
1440          * then check public access.
1441          */
1442         if (crgetuid(cr) != va.va_uid) {
1443                 shift += 3;
1444                 if (!groupmember(va.va_gid, cr))
1445                         shift += 3;
1446         }
1447 
             /*
              * Shifting the mode left by 3 (group) or 6 (public) moves the
              * applicable permission bits into the owner position before
              * they are handed to the policy routine.
              */
1448         return (secpolicy_vnode_access2(cr, vp, va.va_uid,
1449             va.va_mode << shift, mode));
1450 }
1451 
1452 static int nfs_do_symlink_cache = 1;
1453 
1454 /* ARGSUSED */
1455 static int
1456 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1457 {
1458         int error;
1459         struct nfsrdlnres rl;
1460         rnode_t *rp;
1461         int douprintf;
1462         failinfo_t fi;
1463 
1464         /*
1465          * We want to be consistent with UFS semantics so we will return
1466          * EINVAL instead of ENXIO. This violates the XNFS spec and
1467          * the RFC 1094, which are wrong any way. BUGID 1138002.
1468          */
1469         if (vp->v_type != VLNK)
1470                 return (EINVAL);
1471 
1472         if (nfs_zone() != VTOMI(vp)->mi_zone)
1473                 return (EIO);
1474 
1475         rp = VTOR(vp);
1476         if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1477                 error = nfs_validate_caches(vp, cr);
1478                 if (error)
1479                         return (error);
1480                 mutex_enter(&rp->r_statelock);
1481                 if (rp->r_symlink.contents != NULL) {
1482                         error = uiomove(rp->r_symlink.contents,
1483                             rp->r_symlink.len, UIO_READ, uiop);
1484                         mutex_exit(&rp->r_statelock);
1485                         return (error);
1486                 }
1487                 mutex_exit(&rp->r_statelock);
1488         }
1489 
1490 
1491         rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1492 
1493         fi.vp = vp;
1494         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1495         fi.copyproc = nfscopyfh;
1496         fi.lookupproc = nfslookup;
1497         fi.xattrdirproc = acl_getxattrdir2;
1498 
1499         douprintf = 1;
1500 
1501         error = rfs2call(VTOMI(vp), RFS_READLINK,
1502             xdr_readlink, (caddr_t)VTOFH(vp),
1503             xdr_rdlnres, (caddr_t)&rl, cr,
1504             &douprintf, &rl.rl_status, 0, &fi);
1505 
1506         if (error) {
1507 
1508                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1509                 return (error);
1510         }
1511 
1512         error = geterrno(rl.rl_status);
1513         if (!error) {
1514                 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1515                 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1516                         mutex_enter(&rp->r_statelock);
1517                         if (rp->r_symlink.contents == NULL) {
1518                                 rp->r_symlink.contents = rl.rl_data;
1519                                 rp->r_symlink.len = (int)rl.rl_count;
1520                                 rp->r_symlink.size = NFS_MAXPATHLEN;
1521                                 mutex_exit(&rp->r_statelock);
1522                         } else {
1523                                 mutex_exit(&rp->r_statelock);
1524 
1525                                 kmem_free((void *)rl.rl_data,
1526                                     NFS_MAXPATHLEN);
1527                         }
1528                 } else {
1529 
1530                         kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1531                 }
1532         } else {
1533                 PURGE_STALE_FH(error, vp, cr);
1534 
1535                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1536         }
1537 
1538         /*
1539          * Conform to UFS semantics (see comment above)
1540          */
1541         return (error == ENXIO ? EINVAL : error);
1542 }
1543 
1544 /*
1545  * Flush local dirty pages to stable storage on the server.
1546  *
1547  * If FNODSYNC is specified, then there is nothing to do because
1548  * metadata changes are not cached on the client before being
1549  * sent to the server.
1550  */
1551 /* ARGSUSED */
1552 static int
1553 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1554 {
1555         int error;
1556 
1557         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1558                 return (0);
1559 
1560         if (nfs_zone() != VTOMI(vp)->mi_zone)
1561                 return (EIO);
1562 
1563         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1564         if (!error)
1565                 error = VTOR(vp)->r_error;
1566         return (error);
1567 }
1568 
1569 
1570 /*
1571  * Weirdness: if the file was removed or the target of a rename
1572  * operation while it was open, it got renamed instead.  Here we
1573  * remove the renamed file.
1574  */
1575 /* ARGSUSED */
1576 static void
1577 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1578 {
1579         rnode_t *rp;
1580 
1581         ASSERT(vp != DNLC_NO_VNODE);
1582 
1583         /*
1584          * If this is coming from the wrong zone, we let someone in the right
1585          * zone take care of it asynchronously.  We can get here due to
1586          * VN_RELE() being called from pageout() or fsflush().  This call may
1587          * potentially turn into an expensive no-op if, for instance, v_count
1588          * gets incremented in the meantime, but it's still correct.
1589          */
1590         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1591                 nfs_async_inactive(vp, cr, nfs_inactive);
1592                 return;
1593         }
1594 
1595         rp = VTOR(vp);
1596 redo:
1597         if (rp->r_unldvp != NULL) {
1598                 /*
1599                  * Save the vnode pointer for the directory where the
1600                  * unlinked-open file got renamed, then set it to NULL
1601                  * to prevent another thread from getting here before
1602                  * we're done with the remove.  While we have the
1603                  * statelock, make local copies of the pertinent rnode
1604                  * fields.  If we weren't to do this in an atomic way, the
1605                  * the unl* fields could become inconsistent with respect
1606                  * to each other due to a race condition between this
1607                  * code and nfs_remove().  See bug report 1034328.
1608                  */
1609                 mutex_enter(&rp->r_statelock);
1610                 if (rp->r_unldvp != NULL) {
1611                         vnode_t *unldvp;
1612                         char *unlname;
1613                         cred_t *unlcred;
1614                         struct nfsdiropargs da;
1615                         enum nfsstat status;
1616                         int douprintf;
1617                         int error;
1618 
1619                         unldvp = rp->r_unldvp;
1620                         rp->r_unldvp = NULL;
1621                         unlname = rp->r_unlname;
1622                         rp->r_unlname = NULL;
1623                         unlcred = rp->r_unlcred;
1624                         rp->r_unlcred = NULL;
1625                         mutex_exit(&rp->r_statelock);
1626 
1627                         /*
1628                          * If there are any dirty pages left, then flush
1629                          * them.  This is unfortunate because they just
1630                          * may get thrown away during the remove operation,
1631                          * but we have to do this for correctness.
1632                          */
1633                         if (vn_has_cached_data(vp) &&
1634                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1635                                 ASSERT(vp->v_type != VCHR);
1636                                 error = nfs_putpage(vp, (offset_t)0, 0, 0,
1637                                     cr, ct);
1638                                 if (error) {
1639                                         mutex_enter(&rp->r_statelock);
1640                                         if (!rp->r_error)
1641                                                 rp->r_error = error;
1642                                         mutex_exit(&rp->r_statelock);
1643                                 }
1644                         }
1645 
1646                         /*
1647                          * Do the remove operation on the renamed file
1648                          */
1649                         setdiropargs(&da, unlname, unldvp);
1650 
1651                         douprintf = 1;
1652 
1653                         (void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1654                             xdr_diropargs, (caddr_t)&da,
1655                             xdr_enum, (caddr_t)&status, unlcred,
1656                             &douprintf, &status, 0, NULL);
1657 
1658                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1659                                 nfs_purge_rddir_cache(unldvp);
1660                         PURGE_ATTRCACHE(unldvp);
1661 
1662                         /*
1663                          * Release stuff held for the remove
1664                          */
1665                         VN_RELE(unldvp);
1666                         kmem_free(unlname, MAXNAMELEN);
1667                         crfree(unlcred);
1668                         goto redo;
1669                 }
1670                 mutex_exit(&rp->r_statelock);
1671         }
1672 
1673         rp_addfree(rp, cr);
1674 }
1675 
1676 /*
1677  * Remote file system operations having to do with directory manipulation.
1678  */
1679 
1680 /* ARGSUSED */
1681 static int
1682 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1683         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1684         int *direntflags, pathname_t *realpnp)
1685 {
1686         int error;
1687         vnode_t *vp;
1688         vnode_t *avp = NULL;
1689         rnode_t *drp;
1690 
1691         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1692                 return (EPERM);
1693 
1694         drp = VTOR(dvp);
1695 
1696         /*
1697          * Are we looking up extended attributes?  If so, "dvp" is
1698          * the file or directory for which we want attributes, and
1699          * we need a lookup of the hidden attribute directory
1700          * before we lookup the rest of the path.
1701          */
1702         if (flags & LOOKUP_XATTR) {
1703                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1704                 mntinfo_t *mi;
1705 
1706                 mi = VTOMI(dvp);
1707                 if (!(mi->mi_flags & MI_EXTATTR))
1708                         return (EINVAL);
1709 
1710                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1711                         return (EINTR);
1712 
1713                 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1714                 if (avp == NULL)
1715                         error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1716                 else
1717                         error = 0;
1718 
1719                 nfs_rw_exit(&drp->r_rwlock);
1720 
1721                 if (error) {
1722                         if (mi->mi_flags & MI_EXTATTR)
1723                                 return (error);
1724                         return (EINVAL);
1725                 }
1726                 dvp = avp;
1727                 drp = VTOR(dvp);
1728         }
1729 
1730         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1731                 error = EINTR;
1732                 goto out;
1733         }
1734 
1735         error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1736 
1737         nfs_rw_exit(&drp->r_rwlock);
1738 
1739         /*
1740          * If vnode is a device, create special vnode.
1741          */
1742         if (!error && IS_DEVVP(*vpp)) {
1743                 vp = *vpp;
1744                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1745                 VN_RELE(vp);
1746         }
1747 
1748 out:
1749         if (avp != NULL)
1750                 VN_RELE(avp);
1751 
1752         return (error);
1753 }
1754 
1755 static int nfs_lookup_neg_cache = 1;
1756 
1757 #ifdef DEBUG
1758 static int nfs_lookup_dnlc_hits = 0;
1759 static int nfs_lookup_dnlc_misses = 0;
1760 static int nfs_lookup_dnlc_neg_hits = 0;
1761 static int nfs_lookup_dnlc_disappears = 0;
1762 static int nfs_lookup_dnlc_lookups = 0;
1763 #endif
1764 
1765 /* ARGSUSED */
1766 int
1767 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1768         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1769 {
1770         int error;
1771 
1772         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1773 
1774         /*
1775          * If lookup is for "", just return dvp.  Don't need
1776          * to send it over the wire, look it up in the dnlc,
1777          * or perform any access checks.
1778          */
1779         if (*nm == '\0') {
1780                 VN_HOLD(dvp);
1781                 *vpp = dvp;
1782                 return (0);
1783         }
1784 
1785         /*
1786          * Can't do lookups in non-directories.
1787          */
1788         if (dvp->v_type != VDIR)
1789                 return (ENOTDIR);
1790 
1791         /*
1792          * If we're called with RFSCALL_SOFT, it's important that
1793          * the only rfscall is one we make directly; if we permit
1794          * an access call because we're looking up "." or validating
1795          * a dnlc hit, we'll deadlock because that rfscall will not
1796          * have the RFSCALL_SOFT set.
1797          */
1798         if (rfscall_flags & RFSCALL_SOFT)
1799                 goto callit;
1800 
1801         /*
1802          * If lookup is for ".", just return dvp.  Don't need
1803          * to send it over the wire or look it up in the dnlc,
1804          * just need to check access.
1805          */
1806         if (strcmp(nm, ".") == 0) {
1807                 error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1808                 if (error)
1809                         return (error);
1810                 VN_HOLD(dvp);
1811                 *vpp = dvp;
1812                 return (0);
1813         }
1814 
1815         /*
1816          * Lookup this name in the DNLC.  If there was a valid entry,
1817          * then return the results of the lookup.
1818          */
1819         error = nfslookup_dnlc(dvp, nm, vpp, cr);
1820         if (error || *vpp != NULL)
1821                 return (error);
1822 
1823 callit:
1824         error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1825 
1826         return (error);
1827 }
1828 
1829 static int
1830 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1831 {
1832         int error;
1833         vnode_t *vp;
1834 
1835         ASSERT(*nm != '\0');
1836         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1837 
1838         /*
1839          * Lookup this name in the DNLC.  If successful, then validate
1840          * the caches and then recheck the DNLC.  The DNLC is rechecked
1841          * just in case this entry got invalidated during the call
1842          * to nfs_validate_caches.
1843          *
1844          * An assumption is being made that it is safe to say that a
1845          * file exists which may not on the server.  Any operations to
1846          * the server will fail with ESTALE.
1847          */
1848 #ifdef DEBUG
1849         nfs_lookup_dnlc_lookups++;
1850 #endif
1851         vp = dnlc_lookup(dvp, nm);
1852         if (vp != NULL) {
1853                 VN_RELE(vp);
1854                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1855                         PURGE_ATTRCACHE(dvp);
1856                 }
1857                 error = nfs_validate_caches(dvp, cr);
1858                 if (error)
1859                         return (error);
1860                 vp = dnlc_lookup(dvp, nm);
1861                 if (vp != NULL) {
1862                         error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1863                         if (error) {
1864                                 VN_RELE(vp);
1865                                 return (error);
1866                         }
1867                         if (vp == DNLC_NO_VNODE) {
1868                                 VN_RELE(vp);
1869 #ifdef DEBUG
1870                                 nfs_lookup_dnlc_neg_hits++;
1871 #endif
1872                                 return (ENOENT);
1873                         }
1874                         *vpp = vp;
1875 #ifdef DEBUG
1876                         nfs_lookup_dnlc_hits++;
1877 #endif
1878                         return (0);
1879                 }
1880 #ifdef DEBUG
1881                 nfs_lookup_dnlc_disappears++;
1882 #endif
1883         }
1884 #ifdef DEBUG
1885         else
1886                 nfs_lookup_dnlc_misses++;
1887 #endif
1888 
1889         *vpp = NULL;
1890 
1891         return (0);
1892 }
1893 
1894 static int
1895 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1896         int rfscall_flags)
1897 {
1898         int error;
1899         struct nfsdiropargs da;
1900         struct nfsdiropres dr;
1901         int douprintf;
1902         failinfo_t fi;
1903         hrtime_t t;
1904 
1905         ASSERT(*nm != '\0');
1906         ASSERT(dvp->v_type == VDIR);
1907         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1908 
1909         setdiropargs(&da, nm, dvp);
1910 
1911         fi.vp = dvp;
1912         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1913         fi.copyproc = nfscopyfh;
1914         fi.lookupproc = nfslookup;
1915         fi.xattrdirproc = acl_getxattrdir2;
1916 
1917         douprintf = 1;
1918 
1919         t = gethrtime();
1920 
1921         error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1922             xdr_diropargs, (caddr_t)&da,
1923             xdr_diropres, (caddr_t)&dr, cr,
1924             &douprintf, &dr.dr_status, rfscall_flags, &fi);
1925 
1926         if (!error) {
1927                 error = geterrno(dr.dr_status);
1928                 if (!error) {
1929                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1930                             dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1931                         /*
1932                          * If NFS_ACL is supported on the server, then the
1933                          * attributes returned by server may have minimal
1934                          * permissions sometimes denying access to users having
1935                          * proper access.  To get the proper attributes, mark
1936                          * the attributes as expired so that they will be
1937                          * regotten via the NFS_ACL GETATTR2 procedure.
1938                          */
1939                         if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1940                                 PURGE_ATTRCACHE(*vpp);
1941                         }
1942                         if (!(rfscall_flags & RFSCALL_SOFT))
1943                                 dnlc_update(dvp, nm, *vpp);
1944                 } else {
1945                         PURGE_STALE_FH(error, dvp, cr);
1946                         if (error == ENOENT && nfs_lookup_neg_cache)
1947                                 dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1948                 }
1949         }
1950 
1951         return (error);
1952 }
1953 
1954 /* ARGSUSED */
1955 static int
1956 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1957         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1958         vsecattr_t *vsecp)
1959 {
1960         int error;
1961         struct nfscreatargs args;
1962         struct nfsdiropres dr;
1963         int douprintf;
1964         vnode_t *vp;
1965         rnode_t *rp;
1966         struct vattr vattr;
1967         rnode_t *drp;
1968         vnode_t *tempvp;
1969         hrtime_t t;
1970 
1971         drp = VTOR(dvp);
1972 
1973         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1974                 return (EPERM);
1975         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1976                 return (EINTR);
1977 
1978         /*
1979          * We make a copy of the attributes because the caller does not
1980          * expect us to change what va points to.
1981          */
1982         vattr = *va;
1983 
1984         /*
1985          * If the pathname is "", just use dvp.  Don't need
1986          * to send it over the wire, look it up in the dnlc,
1987          * or perform any access checks.
1988          */
1989         if (*nm == '\0') {
1990                 error = 0;
1991                 VN_HOLD(dvp);
1992                 vp = dvp;
1993         /*
1994          * If the pathname is ".", just use dvp.  Don't need
1995          * to send it over the wire or look it up in the dnlc,
1996          * just need to check access.
1997          */
1998         } else if (strcmp(nm, ".") == 0) {
1999                 error = nfs_access(dvp, VEXEC, 0, cr, ct);
2000                 if (error) {
2001                         nfs_rw_exit(&drp->r_rwlock);
2002                         return (error);
2003                 }
2004                 VN_HOLD(dvp);
2005                 vp = dvp;
2006         /*
2007          * We need to go over the wire, just to be sure whether the
2008          * file exists or not.  Using the DNLC can be dangerous in
2009          * this case when making a decision regarding existence.
2010          */
2011         } else {
2012                 error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2013         }
2014         if (!error) {
2015                 if (exclusive == EXCL)
2016                         error = EEXIST;
2017                 else if (vp->v_type == VDIR && (mode & VWRITE))
2018                         error = EISDIR;
2019                 else {
2020                         /*
2021                          * If vnode is a device, create special vnode.
2022                          */
2023                         if (IS_DEVVP(vp)) {
2024                                 tempvp = vp;
2025                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2026                                 VN_RELE(tempvp);
2027                         }
2028                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2029                                 if ((vattr.va_mask & AT_SIZE) &&
2030                                     vp->v_type == VREG) {
2031                                         vattr.va_mask = AT_SIZE;
2032                                         error = nfssetattr(vp, &vattr, 0, cr);
2033                                 }
2034                         }
2035                 }
2036                 nfs_rw_exit(&drp->r_rwlock);
2037                 if (error) {
2038                         VN_RELE(vp);
2039                 } else {
2040                         /*
2041                          * existing file got truncated, notify.
2042                          */
2043                         vnevent_create(vp, ct);
2044                         *vpp = vp;
2045                 }
2046                 return (error);
2047         }
2048 
2049         ASSERT(vattr.va_mask & AT_TYPE);
2050         if (vattr.va_type == VREG) {
2051                 ASSERT(vattr.va_mask & AT_MODE);
2052                 if (MANDMODE(vattr.va_mode)) {
2053                         nfs_rw_exit(&drp->r_rwlock);
2054                         return (EACCES);
2055                 }
2056         }
2057 
2058         dnlc_remove(dvp, nm);
2059 
2060         setdiropargs(&args.ca_da, nm, dvp);
2061 
2062         /*
2063          * Decide what the group-id of the created file should be.
2064          * Set it in attribute list as advisory...then do a setattr
2065          * if the server didn't get it right the first time.
2066          */
2067         error = setdirgid(dvp, &vattr.va_gid, cr);
2068         if (error) {
2069                 nfs_rw_exit(&drp->r_rwlock);
2070                 return (error);
2071         }
2072         vattr.va_mask |= AT_GID;
2073 
2074         /*
2075          * This is a completely gross hack to make mknod
2076          * work over the wire until we can wack the protocol
2077          */
2078 #define IFCHR           0020000         /* character special */
2079 #define IFBLK           0060000         /* block special */
2080 #define IFSOCK          0140000         /* socket */
2081 
2082         /*
2083          * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
2084          * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2085          * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2086          * minor/major numbers <= 8 bits long, compress the device
2087          * number before sending it. Otherwise, the 4.x server will not
2088          * create the device with the correct device number and nothing can be
2089          * done about this.
2090          */
2091         if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2092                 dev_t d = vattr.va_rdev;
2093                 dev32_t dev32;
2094 
2095                 if (vattr.va_type == VCHR)
2096                         vattr.va_mode |= IFCHR;
2097                 else
2098                         vattr.va_mode |= IFBLK;
2099 
2100                 (void) cmpldev(&dev32, d);
2101                 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2102                         vattr.va_size = (u_offset_t)dev32;
2103                 else
2104                         vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2105 
2106                 vattr.va_mask |= AT_MODE|AT_SIZE;
2107         } else if (vattr.va_type == VFIFO) {
2108                 vattr.va_mode |= IFCHR;         /* xtra kludge for namedpipe */
2109                 vattr.va_size = (u_offset_t)NFS_FIFO_DEV;       /* blech */
2110                 vattr.va_mask |= AT_MODE|AT_SIZE;
2111         } else if (vattr.va_type == VSOCK) {
2112                 vattr.va_mode |= IFSOCK;
2113                 /*
2114                  * To avoid triggering bugs in the servers set AT_SIZE
2115                  * (all other RFS_CREATE calls set this).
2116                  */
2117                 vattr.va_size = 0;
2118                 vattr.va_mask |= AT_MODE|AT_SIZE;
2119         }
2120 
2121         args.ca_sa = &args.ca_sa_buf;
2122         error = vattr_to_sattr(&vattr, args.ca_sa);
2123         if (error) {
2124                 /* req time field(s) overflow - return immediately */
2125                 nfs_rw_exit(&drp->r_rwlock);
2126                 return (error);
2127         }
2128 
2129         douprintf = 1;
2130 
2131         t = gethrtime();
2132 
2133         error = rfs2call(VTOMI(dvp), RFS_CREATE,
2134             xdr_creatargs, (caddr_t)&args,
2135             xdr_diropres, (caddr_t)&dr, cr,
2136             &douprintf, &dr.dr_status, 0, NULL);
2137 
2138         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2139 
2140         if (!error) {
2141                 error = geterrno(dr.dr_status);
2142                 if (!error) {
2143                         if (HAVE_RDDIR_CACHE(drp))
2144                                 nfs_purge_rddir_cache(dvp);
2145                         vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2146                             dvp->v_vfsp, t, cr, NULL, NULL);
2147                         /*
2148                          * If NFS_ACL is supported on the server, then the
2149                          * attributes returned by server may have minimal
2150                          * permissions sometimes denying access to users having
2151                          * proper access.  To get the proper attributes, mark
2152                          * the attributes as expired so that they will be
2153                          * regotten via the NFS_ACL GETATTR2 procedure.
2154                          */
2155                         if (VTOMI(vp)->mi_flags & MI_ACL) {
2156                                 PURGE_ATTRCACHE(vp);
2157                         }
2158                         dnlc_update(dvp, nm, vp);
2159                         rp = VTOR(vp);
2160                         if (vattr.va_size == 0) {
2161                                 mutex_enter(&rp->r_statelock);
2162                                 rp->r_size = 0;
2163                                 mutex_exit(&rp->r_statelock);
2164                                 if (vn_has_cached_data(vp)) {
2165                                         ASSERT(vp->v_type != VCHR);
2166                                         nfs_invalidate_pages(vp,
2167                                             (u_offset_t)0, cr);
2168                                 }
2169                         }
2170 
2171                         /*
2172                          * Make sure the gid was set correctly.
2173                          * If not, try to set it (but don't lose
2174                          * any sleep over it).
2175                          */
2176                         if (vattr.va_gid != rp->r_attr.va_gid) {
2177                                 vattr.va_mask = AT_GID;
2178                                 (void) nfssetattr(vp, &vattr, 0, cr);
2179                         }
2180 
2181                         /*
2182                          * If vnode is a device create special vnode
2183                          */
2184                         if (IS_DEVVP(vp)) {
2185                                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2186                                 VN_RELE(vp);
2187                         } else
2188                                 *vpp = vp;
2189                 } else {
2190                         PURGE_STALE_FH(error, dvp, cr);
2191                 }
2192         }
2193 
2194         nfs_rw_exit(&drp->r_rwlock);
2195 
2196         return (error);
2197 }
2198 
2199 /*
2200  * Weirdness: if the vnode to be removed is open
2201  * we rename it instead of removing it and nfs_inactive
2202  * will remove the new name.
2203  */
2204 /* ARGSUSED */
2205 static int
2206 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2207 {
2208         int error;
2209         struct nfsdiropargs da;
2210         enum nfsstat status;
2211         vnode_t *vp;
2212         char *tmpname;
2213         int douprintf;
2214         rnode_t *rp;
2215         rnode_t *drp;
2216 
2217         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2218                 return (EPERM);
2219         drp = VTOR(dvp);
2220         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2221                 return (EINTR);
2222 
2223         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2224         if (error) {
2225                 nfs_rw_exit(&drp->r_rwlock);
2226                 return (error);
2227         }
2228 
2229         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2230                 VN_RELE(vp);
2231                 nfs_rw_exit(&drp->r_rwlock);
2232                 return (EPERM);
2233         }
2234 
2235         /*
2236          * First just remove the entry from the name cache, as it
2237          * is most likely the only entry for this vp.
2238          */
2239         dnlc_remove(dvp, nm);
2240 
2241         /*
2242          * If the file has a v_count > 1 then there may be more than one
2243          * entry in the name cache due multiple links or an open file,
2244          * but we don't have the real reference count so flush all
2245          * possible entries.
2246          */
2247         if (vp->v_count > 1)
2248                 dnlc_purge_vp(vp);
2249 
2250         /*
2251          * Now we have the real reference count on the vnode
2252          */
2253         rp = VTOR(vp);
2254         mutex_enter(&rp->r_statelock);
2255         if (vp->v_count > 1 &&
2256             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2257                 mutex_exit(&rp->r_statelock);
2258                 tmpname = newname();
2259                 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2260                 if (error)
2261                         kmem_free(tmpname, MAXNAMELEN);
2262                 else {
2263                         mutex_enter(&rp->r_statelock);
2264                         if (rp->r_unldvp == NULL) {
2265                                 VN_HOLD(dvp);
2266                                 rp->r_unldvp = dvp;
2267                                 if (rp->r_unlcred != NULL)
2268                                         crfree(rp->r_unlcred);
2269                                 crhold(cr);
2270                                 rp->r_unlcred = cr;
2271                                 rp->r_unlname = tmpname;
2272                         } else {
2273                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2274                                 rp->r_unlname = tmpname;
2275                         }
2276                         mutex_exit(&rp->r_statelock);
2277                 }
2278         } else {
2279                 mutex_exit(&rp->r_statelock);
2280                 /*
2281                  * We need to flush any dirty pages which happen to
2282                  * be hanging around before removing the file.  This
2283                  * shouldn't happen very often and mostly on file
2284                  * systems mounted "nocto".
2285                  */
2286                 if (vn_has_cached_data(vp) &&
2287                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2288                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2289                         if (error && (error == ENOSPC || error == EDQUOT)) {
2290                                 mutex_enter(&rp->r_statelock);
2291                                 if (!rp->r_error)
2292                                         rp->r_error = error;
2293                                 mutex_exit(&rp->r_statelock);
2294                         }
2295                 }
2296 
2297                 setdiropargs(&da, nm, dvp);
2298 
2299                 douprintf = 1;
2300 
2301                 error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2302                     xdr_diropargs, (caddr_t)&da,
2303                     xdr_enum, (caddr_t)&status, cr,
2304                     &douprintf, &status, 0, NULL);
2305 
2306                 /*
2307                  * The xattr dir may be gone after last attr is removed,
2308                  * so flush it from dnlc.
2309                  */
2310                 if (dvp->v_flag & V_XATTRDIR)
2311                         dnlc_purge_vp(dvp);
2312 
2313                 PURGE_ATTRCACHE(dvp);   /* mod time changed */
2314                 PURGE_ATTRCACHE(vp);    /* link count changed */
2315 
2316                 if (!error) {
2317                         error = geterrno(status);
2318                         if (!error) {
2319                                 if (HAVE_RDDIR_CACHE(drp))
2320                                         nfs_purge_rddir_cache(dvp);
2321                         } else {
2322                                 PURGE_STALE_FH(error, dvp, cr);
2323                         }
2324                 }
2325         }
2326 
2327         if (error == 0) {
2328                 vnevent_remove(vp, dvp, nm, ct);
2329         }
2330         VN_RELE(vp);
2331 
2332         nfs_rw_exit(&drp->r_rwlock);
2333 
2334         return (error);
2335 }
2336 
2337 /* ARGSUSED */
2338 static int
2339 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2340         caller_context_t *ct, int flags)
2341 {
2342         int error;
2343         struct nfslinkargs args;
2344         enum nfsstat status;
2345         vnode_t *realvp;
2346         int douprintf;
2347         rnode_t *tdrp;
2348 
2349         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2350                 return (EPERM);
2351         if (VOP_REALVP(svp, &realvp, ct) == 0)
2352                 svp = realvp;
2353 
2354         args.la_from = VTOFH(svp);
2355         setdiropargs(&args.la_to, tnm, tdvp);
2356 
2357         tdrp = VTOR(tdvp);
2358         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2359                 return (EINTR);
2360 
2361         dnlc_remove(tdvp, tnm);
2362 
2363         douprintf = 1;
2364 
2365         error = rfs2call(VTOMI(svp), RFS_LINK,
2366             xdr_linkargs, (caddr_t)&args,
2367             xdr_enum, (caddr_t)&status, cr,
2368             &douprintf, &status, 0, NULL);
2369 
2370         PURGE_ATTRCACHE(tdvp);  /* mod time changed */
2371         PURGE_ATTRCACHE(svp);   /* link count changed */
2372 
2373         if (!error) {
2374                 error = geterrno(status);
2375                 if (!error) {
2376                         if (HAVE_RDDIR_CACHE(tdrp))
2377                                 nfs_purge_rddir_cache(tdvp);
2378                 }
2379         }
2380 
2381         nfs_rw_exit(&tdrp->r_rwlock);
2382 
2383         if (!error) {
2384                 /*
2385                  * Notify the source file of this link operation.
2386                  */
2387                 vnevent_link(svp, ct);
2388         }
2389         return (error);
2390 }
2391 
2392 /* ARGSUSED */
2393 static int
2394 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2395         caller_context_t *ct, int flags)
2396 {
2397         vnode_t *realvp;
2398 
2399         if (nfs_zone() != VTOMI(odvp)->mi_zone)
2400                 return (EPERM);
2401         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2402                 ndvp = realvp;
2403 
2404         return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2405 }
2406 
2407 /*
2408  * nfsrename does the real work of renaming in NFS Version 2.
2409  */
2410 static int
2411 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2412     caller_context_t *ct)
2413 {
2414         int error;
2415         enum nfsstat status;
2416         struct nfsrnmargs args;
2417         int douprintf;
2418         vnode_t *nvp = NULL;
2419         vnode_t *ovp = NULL;
2420         char *tmpname;
2421         rnode_t *rp;
2422         rnode_t *odrp;
2423         rnode_t *ndrp;
2424 
2425         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2426         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2427             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2428                 return (EINVAL);
2429 
2430         odrp = VTOR(odvp);
2431         ndrp = VTOR(ndvp);
2432         if ((intptr_t)odrp < (intptr_t)ndrp) {
2433                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2434                         return (EINTR);
2435                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2436                         nfs_rw_exit(&odrp->r_rwlock);
2437                         return (EINTR);
2438                 }
2439         } else {
2440                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2441                         return (EINTR);
2442                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2443                         nfs_rw_exit(&ndrp->r_rwlock);
2444                         return (EINTR);
2445                 }
2446         }
2447 
2448         /*
2449          * Lookup the target file.  If it exists, it needs to be
2450          * checked to see whether it is a mount point and whether
2451          * it is active (open).
2452          */
2453         error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2454         if (!error) {
2455                 /*
2456                  * If this file has been mounted on, then just
2457                  * return busy because renaming to it would remove
2458                  * the mounted file system from the name space.
2459                  */
2460                 if (vn_mountedvfs(nvp) != NULL) {
2461                         VN_RELE(nvp);
2462                         nfs_rw_exit(&odrp->r_rwlock);
2463                         nfs_rw_exit(&ndrp->r_rwlock);
2464                         return (EBUSY);
2465                 }
2466 
2467                 /*
2468                  * Purge the name cache of all references to this vnode
2469                  * so that we can check the reference count to infer
2470                  * whether it is active or not.
2471                  */
2472                 /*
2473                  * First just remove the entry from the name cache, as it
2474                  * is most likely the only entry for this vp.
2475                  */
2476                 dnlc_remove(ndvp, nnm);
2477                 /*
2478                  * If the file has a v_count > 1 then there may be more
2479                  * than one entry in the name cache due multiple links
2480                  * or an open file, but we don't have the real reference
2481                  * count so flush all possible entries.
2482                  */
2483                 if (nvp->v_count > 1)
2484                         dnlc_purge_vp(nvp);
2485 
2486                 /*
2487                  * If the vnode is active and is not a directory,
2488                  * arrange to rename it to a
2489                  * temporary file so that it will continue to be
2490                  * accessible.  This implements the "unlink-open-file"
2491                  * semantics for the target of a rename operation.
2492                  * Before doing this though, make sure that the
2493                  * source and target files are not already the same.
2494                  */
2495                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2496                         /*
2497                          * Lookup the source name.
2498                          */
2499                         error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2500                             cr, 0);
2501 
2502                         /*
2503                          * The source name *should* already exist.
2504                          */
2505                         if (error) {
2506                                 VN_RELE(nvp);
2507                                 nfs_rw_exit(&odrp->r_rwlock);
2508                                 nfs_rw_exit(&ndrp->r_rwlock);
2509                                 return (error);
2510                         }
2511 
2512                         /*
2513                          * Compare the two vnodes.  If they are the same,
2514                          * just release all held vnodes and return success.
2515                          */
2516                         if (ovp == nvp) {
2517                                 VN_RELE(ovp);
2518                                 VN_RELE(nvp);
2519                                 nfs_rw_exit(&odrp->r_rwlock);
2520                                 nfs_rw_exit(&ndrp->r_rwlock);
2521                                 return (0);
2522                         }
2523 
2524                         /*
2525                          * Can't mix and match directories and non-
2526                          * directories in rename operations.  We already
2527                          * know that the target is not a directory.  If
2528                          * the source is a directory, return an error.
2529                          */
2530                         if (ovp->v_type == VDIR) {
2531                                 VN_RELE(ovp);
2532                                 VN_RELE(nvp);
2533                                 nfs_rw_exit(&odrp->r_rwlock);
2534                                 nfs_rw_exit(&ndrp->r_rwlock);
2535                                 return (ENOTDIR);
2536                         }
2537 
2538                         /*
2539                          * The target file exists, is not the same as
2540                          * the source file, and is active.  Link it
2541                          * to a temporary filename to avoid having
2542                          * the server removing the file completely.
2543                          */
2544                         tmpname = newname();
2545                         error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2546                         if (error == EOPNOTSUPP) {
2547                                 error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2548                                     cr, NULL, 0);
2549                         }
2550                         if (error) {
2551                                 kmem_free(tmpname, MAXNAMELEN);
2552                                 VN_RELE(ovp);
2553                                 VN_RELE(nvp);
2554                                 nfs_rw_exit(&odrp->r_rwlock);
2555                                 nfs_rw_exit(&ndrp->r_rwlock);
2556                                 return (error);
2557                         }
2558                         rp = VTOR(nvp);
2559                         mutex_enter(&rp->r_statelock);
2560                         if (rp->r_unldvp == NULL) {
2561                                 VN_HOLD(ndvp);
2562                                 rp->r_unldvp = ndvp;
2563                                 if (rp->r_unlcred != NULL)
2564                                         crfree(rp->r_unlcred);
2565                                 crhold(cr);
2566                                 rp->r_unlcred = cr;
2567                                 rp->r_unlname = tmpname;
2568                         } else {
2569                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2570                                 rp->r_unlname = tmpname;
2571                         }
2572                         mutex_exit(&rp->r_statelock);
2573                 }
2574         }
2575 
2576         if (ovp == NULL) {
2577                 /*
2578                  * When renaming directories to be a subdirectory of a
2579                  * different parent, the dnlc entry for ".." will no
2580                  * longer be valid, so it must be removed.
2581                  *
2582                  * We do a lookup here to determine whether we are renaming
2583                  * a directory and we need to check if we are renaming
2584                  * an unlinked file.  This might have already been done
2585                  * in previous code, so we check ovp == NULL to avoid
2586                  * doing it twice.
2587                  */
2588 
2589                 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2590 
2591                 /*
2592                  * The source name *should* already exist.
2593                  */
2594                 if (error) {
2595                         nfs_rw_exit(&odrp->r_rwlock);
2596                         nfs_rw_exit(&ndrp->r_rwlock);
2597                         if (nvp) {
2598                                 VN_RELE(nvp);
2599                         }
2600                         return (error);
2601                 }
2602                 ASSERT(ovp != NULL);
2603         }
2604 
2605         dnlc_remove(odvp, onm);
2606         dnlc_remove(ndvp, nnm);
2607 
2608         setdiropargs(&args.rna_from, onm, odvp);
2609         setdiropargs(&args.rna_to, nnm, ndvp);
2610 
2611         douprintf = 1;
2612 
2613         error = rfs2call(VTOMI(odvp), RFS_RENAME,
2614             xdr_rnmargs, (caddr_t)&args,
2615             xdr_enum, (caddr_t)&status, cr,
2616             &douprintf, &status, 0, NULL);
2617 
2618         PURGE_ATTRCACHE(odvp);  /* mod time changed */
2619         PURGE_ATTRCACHE(ndvp);  /* mod time changed */
2620 
2621         if (!error) {
2622                 error = geterrno(status);
2623                 if (!error) {
2624                         if (HAVE_RDDIR_CACHE(odrp))
2625                                 nfs_purge_rddir_cache(odvp);
2626                         if (HAVE_RDDIR_CACHE(ndrp))
2627                                 nfs_purge_rddir_cache(ndvp);
2628                         /*
2629                          * when renaming directories to be a subdirectory of a
2630                          * different parent, the dnlc entry for ".." will no
2631                          * longer be valid, so it must be removed
2632                          */
2633                         rp = VTOR(ovp);
2634                         if (ndvp != odvp) {
2635                                 if (ovp->v_type == VDIR) {
2636                                         dnlc_remove(ovp, "..");
2637                                         if (HAVE_RDDIR_CACHE(rp))
2638                                                 nfs_purge_rddir_cache(ovp);
2639                                 }
2640                         }
2641 
2642                         /*
2643                          * If we are renaming the unlinked file, update the
2644                          * r_unldvp and r_unlname as needed.
2645                          */
2646                         mutex_enter(&rp->r_statelock);
2647                         if (rp->r_unldvp != NULL) {
2648                                 if (strcmp(rp->r_unlname, onm) == 0) {
2649                                         (void) strncpy(rp->r_unlname,
2650                                             nnm, MAXNAMELEN);
2651                                         rp->r_unlname[MAXNAMELEN - 1] = '\0';
2652 
2653                                         if (ndvp != rp->r_unldvp) {
2654                                                 VN_RELE(rp->r_unldvp);
2655                                                 rp->r_unldvp = ndvp;
2656                                                 VN_HOLD(ndvp);
2657                                         }
2658                                 }
2659                         }
2660                         mutex_exit(&rp->r_statelock);
2661                 } else {
2662                         /*
2663                          * System V defines rename to return EEXIST, not
2664                          * ENOTEMPTY if the target directory is not empty.
2665                          * Over the wire, the error is NFSERR_ENOTEMPTY
2666                          * which geterrno maps to ENOTEMPTY.
2667                          */
2668                         if (error == ENOTEMPTY)
2669                                 error = EEXIST;
2670                 }
2671         }
2672 
2673         if (error == 0) {
2674                 if (nvp)
2675                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
2676 
2677                 if (odvp != ndvp)
2678                         vnevent_rename_dest_dir(ndvp, ct);
2679 
2680                 ASSERT(ovp != NULL);
2681                 vnevent_rename_src(ovp, odvp, onm, ct);
2682         }
2683 
2684         if (nvp) {
2685                 VN_RELE(nvp);
2686         }
2687         VN_RELE(ovp);
2688 
2689         nfs_rw_exit(&odrp->r_rwlock);
2690         nfs_rw_exit(&ndrp->r_rwlock);
2691 
2692         return (error);
2693 }
2694 
2695 /* ARGSUSED */
2696 static int
2697 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2698         caller_context_t *ct, int flags, vsecattr_t *vsecp)
2699 {
2700         int error;
2701         struct nfscreatargs args;
2702         struct nfsdiropres dr;
2703         int douprintf;
2704         rnode_t *drp;
2705         hrtime_t t;
2706 
2707         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2708                 return (EPERM);
2709 
2710         setdiropargs(&args.ca_da, nm, dvp);
2711 
2712         /*
2713          * Decide what the group-id and set-gid bit of the created directory
2714          * should be.  May have to do a setattr to get the gid right.
2715          */
2716         error = setdirgid(dvp, &va->va_gid, cr);
2717         if (error)
2718                 return (error);
2719         error = setdirmode(dvp, &va->va_mode, cr);
2720         if (error)
2721                 return (error);
2722         va->va_mask |= AT_MODE|AT_GID;
2723 
2724         args.ca_sa = &args.ca_sa_buf;
2725         error = vattr_to_sattr(va, args.ca_sa);
2726         if (error) {
2727                 /* req time field(s) overflow - return immediately */
2728                 return (error);
2729         }
2730 
2731         drp = VTOR(dvp);
2732         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2733                 return (EINTR);
2734 
2735         dnlc_remove(dvp, nm);
2736 
2737         douprintf = 1;
2738 
2739         t = gethrtime();
2740 
2741         error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2742             xdr_creatargs, (caddr_t)&args,
2743             xdr_diropres, (caddr_t)&dr, cr,
2744             &douprintf, &dr.dr_status, 0, NULL);
2745 
2746         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2747 
2748         if (!error) {
2749                 error = geterrno(dr.dr_status);
2750                 if (!error) {
2751                         if (HAVE_RDDIR_CACHE(drp))
2752                                 nfs_purge_rddir_cache(dvp);
2753                         /*
2754                          * The attributes returned by RFS_MKDIR can not
2755                          * be depended upon, so mark the attribute cache
2756                          * as purged.  A subsequent GETATTR will get the
2757                          * correct attributes from the server.
2758                          */
2759                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2760                             dvp->v_vfsp, t, cr, NULL, NULL);
2761                         PURGE_ATTRCACHE(*vpp);
2762                         dnlc_update(dvp, nm, *vpp);
2763 
2764                         /*
2765                          * Make sure the gid was set correctly.
2766                          * If not, try to set it (but don't lose
2767                          * any sleep over it).
2768                          */
2769                         if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2770                                 va->va_mask = AT_GID;
2771                                 (void) nfssetattr(*vpp, va, 0, cr);
2772                         }
2773                 } else {
2774                         PURGE_STALE_FH(error, dvp, cr);
2775                 }
2776         }
2777 
2778         nfs_rw_exit(&drp->r_rwlock);
2779 
2780         return (error);
2781 }
2782 
2783 /* ARGSUSED */
2784 static int
2785 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2786         caller_context_t *ct, int flags)
2787 {
2788         int error;
2789         enum nfsstat status;
2790         struct nfsdiropargs da;
2791         vnode_t *vp;
2792         int douprintf;
2793         rnode_t *drp;
2794 
2795         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2796                 return (EPERM);
2797         drp = VTOR(dvp);
2798         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2799                 return (EINTR);
2800 
2801         /*
2802          * Attempt to prevent a rmdir(".") from succeeding.
2803          */
2804         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2805         if (error) {
2806                 nfs_rw_exit(&drp->r_rwlock);
2807                 return (error);
2808         }
2809 
2810         if (vp == cdir) {
2811                 VN_RELE(vp);
2812                 nfs_rw_exit(&drp->r_rwlock);
2813                 return (EINVAL);
2814         }
2815 
2816         setdiropargs(&da, nm, dvp);
2817 
2818         /*
2819          * First just remove the entry from the name cache, as it
2820          * is most likely an entry for this vp.
2821          */
2822         dnlc_remove(dvp, nm);
2823 
2824         /*
2825          * If there vnode reference count is greater than one, then
2826          * there may be additional references in the DNLC which will
2827          * need to be purged.  First, trying removing the entry for
2828          * the parent directory and see if that removes the additional
2829          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2830          * to completely remove any references to the directory which
2831          * might still exist in the DNLC.
2832          */
2833         if (vp->v_count > 1) {
2834                 dnlc_remove(vp, "..");
2835                 if (vp->v_count > 1)
2836                         dnlc_purge_vp(vp);
2837         }
2838 
2839         douprintf = 1;
2840 
2841         error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2842             xdr_diropargs, (caddr_t)&da,
2843             xdr_enum, (caddr_t)&status, cr,
2844             &douprintf, &status, 0, NULL);
2845 
2846         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2847 
2848         if (error) {
2849                 VN_RELE(vp);
2850                 nfs_rw_exit(&drp->r_rwlock);
2851                 return (error);
2852         }
2853 
2854         error = geterrno(status);
2855         if (!error) {
2856                 if (HAVE_RDDIR_CACHE(drp))
2857                         nfs_purge_rddir_cache(dvp);
2858                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
2859                         nfs_purge_rddir_cache(vp);
2860         } else {
2861                 PURGE_STALE_FH(error, dvp, cr);
2862                 /*
2863                  * System V defines rmdir to return EEXIST, not
2864                  * ENOTEMPTY if the directory is not empty.  Over
2865                  * the wire, the error is NFSERR_ENOTEMPTY which
2866                  * geterrno maps to ENOTEMPTY.
2867                  */
2868                 if (error == ENOTEMPTY)
2869                         error = EEXIST;
2870         }
2871 
2872         if (error == 0) {
2873                 vnevent_rmdir(vp, dvp, nm, ct);
2874         }
2875         VN_RELE(vp);
2876 
2877         nfs_rw_exit(&drp->r_rwlock);
2878 
2879         return (error);
2880 }
2881 
2882 /* ARGSUSED */
2883 static int
2884 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2885         caller_context_t *ct, int flags)
2886 {
2887         int error;
2888         struct nfsslargs args;
2889         enum nfsstat status;
2890         int douprintf;
2891         rnode_t *drp;
2892 
2893         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2894                 return (EPERM);
2895         setdiropargs(&args.sla_from, lnm, dvp);
2896         args.sla_sa = &args.sla_sa_buf;
2897         error = vattr_to_sattr(tva, args.sla_sa);
2898         if (error) {
2899                 /* req time field(s) overflow - return immediately */
2900                 return (error);
2901         }
2902         args.sla_tnm = tnm;
2903 
2904         drp = VTOR(dvp);
2905         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2906                 return (EINTR);
2907 
2908         dnlc_remove(dvp, lnm);
2909 
2910         douprintf = 1;
2911 
2912         error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2913             xdr_slargs, (caddr_t)&args,
2914             xdr_enum, (caddr_t)&status, cr,
2915             &douprintf, &status, 0, NULL);
2916 
2917         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2918 
2919         if (!error) {
2920                 error = geterrno(status);
2921                 if (!error) {
2922                         if (HAVE_RDDIR_CACHE(drp))
2923                                 nfs_purge_rddir_cache(dvp);
2924                 } else {
2925                         PURGE_STALE_FH(error, dvp, cr);
2926                 }
2927         }
2928 
2929         nfs_rw_exit(&drp->r_rwlock);
2930 
2931         return (error);
2932 }
2933 
/*
 * Compatibility switch consulted by nfs_readdir(): when non-zero, the
 * size of a readdir request is capped at 0x400 bytes rather than
 * NFS_MAXDATA.
 */
static int nfs_shrinkreaddir = 0;

#ifdef DEBUG
/*
 * Readdir cache counters, compiled into DEBUG kernels only.
 * nfs_readdir() bumps "shorts" when it short-circuits a read at the
 * end-of-directory cookie and "waits" when it has to wait for an entry
 * that another thread is filling.  The remaining counters are presumably
 * maintained by parts of the readdir code not shown here -- TODO confirm.
 */
static int nfs_readdir_readahead = 0;
static int nfs_readdir_cache_misses = 0;
static int nfs_readdir_cache_waits = 0;
static int nfs_readdir_cache_shorts = 0;
static int nfs_readdir_cache_hits = 0;
#endif
2943 
2944 /*
2945  * Read directory entries.
2946  * There are some weird things to look out for here.  The uio_offset
2947  * field is either 0 or it is the offset returned from a previous
2948  * readdir.  It is an opaque value used by the server to find the
2949  * correct directory block to read. The count field is the number
2950  * of blocks to read on the server.  This is advisory only, the server
2951  * may return only one block's worth of entries.  Entries may be compressed
2952  * on the server.
2953  */
2954 /* ARGSUSED */
2955 static int
2956 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2957         caller_context_t *ct, int flags)
2958 {
2959         int error;
2960         size_t count;
2961         rnode_t *rp;
2962         rddir_cache *rdc;
2963         rddir_cache *nrdc;
2964         rddir_cache *rrdc;
2965 #ifdef DEBUG
2966         int missed;
2967 #endif
2968         rddir_cache srdc;
2969         avl_index_t where;
2970 
2971         rp = VTOR(vp);
2972 
2973         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2974         if (nfs_zone() != VTOMI(vp)->mi_zone)
2975                 return (EIO);
2976         /*
2977          * Make sure that the directory cache is valid.
2978          */
2979         if (HAVE_RDDIR_CACHE(rp)) {
2980                 if (nfs_disable_rddir_cache) {
2981                         /*
2982                          * Setting nfs_disable_rddir_cache in /etc/system
2983                          * allows interoperability with servers that do not
2984                          * properly update the attributes of directories.
2985                          * Any cached information gets purged before an
2986                          * access is made to it.
2987                          */
2988                         nfs_purge_rddir_cache(vp);
2989                 } else {
2990                         error = nfs_validate_caches(vp, cr);
2991                         if (error)
2992                                 return (error);
2993                 }
2994         }
2995 
2996         /*
2997          * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
2998          * RFS_READDIR request with rda_count set to more than 0x400. So
2999          * we reduce the request size here purely for compatibility.
3000          *
3001          * In general, this is no longer required.  However, if a server
3002          * is discovered which can not handle requests larger than 1024,
3003          * nfs_shrinkreaddir can be set to 1 to enable this backwards
3004          * compatibility.
3005          *
3006          * In any case, the request size is limited to NFS_MAXDATA bytes.
3007          */
3008         count = MIN(uiop->uio_iov->iov_len,
3009             nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3010 
3011         nrdc = NULL;
3012 #ifdef DEBUG
3013         missed = 0;
3014 #endif
3015 top:
3016         /*
3017          * Short circuit last readdir which always returns 0 bytes.
3018          * This can be done after the directory has been read through
3019          * completely at least once.  This will set r_direof which
3020          * can be used to find the value of the last cookie.
3021          */
3022         mutex_enter(&rp->r_statelock);
3023         if (rp->r_direof != NULL &&
3024             uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3025                 mutex_exit(&rp->r_statelock);
3026 #ifdef DEBUG
3027                 nfs_readdir_cache_shorts++;
3028 #endif
3029                 if (eofp)
3030                         *eofp = 1;
3031                 if (nrdc != NULL)
3032                         rddir_cache_rele(nrdc);
3033                 return (0);
3034         }
3035         /*
3036          * Look for a cache entry.  Cache entries are identified
3037          * by the NFS cookie value and the byte count requested.
3038          */
3039         srdc.nfs_cookie = uiop->uio_offset;
3040         srdc.buflen = count;
3041         rdc = avl_find(&rp->r_dir, &srdc, &where);
3042         if (rdc != NULL) {
3043                 rddir_cache_hold(rdc);
3044                 /*
3045                  * If the cache entry is in the process of being
3046                  * filled in, wait until this completes.  The
3047                  * RDDIRWAIT bit is set to indicate that someone
3048                  * is waiting; when the thread currently
3049                  * filling the entry is done, it should do a
3050                  * cv_broadcast to wake up all of the threads
3051                  * waiting for it to finish.
3052                  */
3053                 if (rdc->flags & RDDIR) {
3054                         nfs_rw_exit(&rp->r_rwlock);
3055                         rdc->flags |= RDDIRWAIT;
3056 #ifdef DEBUG
3057                         nfs_readdir_cache_waits++;
3058 #endif
3059                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3060                                 /*
3061                                  * We got interrupted, probably
3062                                  * the user typed ^C or an alarm
3063                                  * fired.  We free the new entry
3064                                  * if we allocated one.
3065                                  */
3066                                 mutex_exit(&rp->r_statelock);
3067                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3068                                     RW_READER, FALSE);
3069                                 rddir_cache_rele(rdc);
3070                                 if (nrdc != NULL)
3071                                         rddir_cache_rele(nrdc);
3072                                 return (EINTR);
3073                         }
3074                         mutex_exit(&rp->r_statelock);
3075                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3076                             RW_READER, FALSE);
3077                         rddir_cache_rele(rdc);
3078                         goto top;
3079                 }
3080                 /*
3081                  * Check to see if a readdir is required to
3082                  * fill the entry.  If so, mark this entry
3083                  * as being filled, remove our reference,
3084                  * and branch to the code to fill the entry.
3085                  */
3086                 if (rdc->flags & RDDIRREQ) {
3087                         rdc->flags &= ~RDDIRREQ;
3088                         rdc->flags |= RDDIR;
3089                         if (nrdc != NULL)
3090                                 rddir_cache_rele(nrdc);
3091                         nrdc = rdc;
3092                         mutex_exit(&rp->r_statelock);
3093                         goto bottom;
3094                 }
3095 #ifdef DEBUG
3096                 if (!missed)
3097                         nfs_readdir_cache_hits++;
3098 #endif
3099                 /*
3100                  * If an error occurred while attempting
3101                  * to fill the cache entry, just return it.
3102                  */
3103                 if (rdc->error) {
3104                         error = rdc->error;
3105                         mutex_exit(&rp->r_statelock);
3106                         rddir_cache_rele(rdc);
3107                         if (nrdc != NULL)
3108                                 rddir_cache_rele(nrdc);
3109                         return (error);
3110                 }
3111 
3112                 /*
3113                  * The cache entry is complete and good,
3114                  * copyout the dirent structs to the calling
3115                  * thread.
3116                  */
3117                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3118 
3119                 /*
3120                  * If no error occurred during the copyout,
3121                  * update the offset in the uio struct to
3122                  * contain the value of the next cookie
3123                  * and set the eof value appropriately.
3124                  */
3125                 if (!error) {
3126                         uiop->uio_offset = rdc->nfs_ncookie;
3127                         if (eofp)
3128                                 *eofp = rdc->eof;
3129                 }
3130 
3131                 /*
3132                  * Decide whether to do readahead.  Don't if
3133                  * have already read to the end of directory.
3134                  */
3135                 if (rdc->eof) {
3136                         rp->r_direof = rdc;
3137                         mutex_exit(&rp->r_statelock);
3138                         rddir_cache_rele(rdc);
3139                         if (nrdc != NULL)
3140                                 rddir_cache_rele(nrdc);
3141                         return (error);
3142                 }
3143 
3144                 /*
3145                  * Check to see whether we found an entry
3146                  * for the readahead.  If so, we don't need
3147                  * to do anything further, so free the new
3148                  * entry if one was allocated.  Otherwise,
3149                  * allocate a new entry, add it to the cache,
3150                  * and then initiate an asynchronous readdir
3151                  * operation to fill it.
3152                  */
3153                 srdc.nfs_cookie = rdc->nfs_ncookie;
3154                 srdc.buflen = count;
3155                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3156                 if (rrdc != NULL) {
3157                         if (nrdc != NULL)
3158                                 rddir_cache_rele(nrdc);
3159                 } else {
3160                         if (nrdc != NULL)
3161                                 rrdc = nrdc;
3162                         else {
3163                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3164                         }
3165                         if (rrdc != NULL) {
3166                                 rrdc->nfs_cookie = rdc->nfs_ncookie;
3167                                 rrdc->buflen = count;
3168                                 avl_insert(&rp->r_dir, rrdc, where);
3169                                 rddir_cache_hold(rrdc);
3170                                 mutex_exit(&rp->r_statelock);
3171                                 rddir_cache_rele(rdc);
3172 #ifdef DEBUG
3173                                 nfs_readdir_readahead++;
3174 #endif
3175                                 nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3176                                 return (error);
3177                         }
3178                 }
3179 
3180                 mutex_exit(&rp->r_statelock);
3181                 rddir_cache_rele(rdc);
3182                 return (error);
3183         }
3184 
3185         /*
3186          * Didn't find an entry in the cache.  Construct a new empty
3187          * entry and link it into the cache.  Other processes attempting
3188          * to access this entry will need to wait until it is filled in.
3189          *
3190          * Since kmem_alloc may block, another pass through the cache
3191          * will need to be taken to make sure that another process
3192          * hasn't already added an entry to the cache for this request.
3193          */
3194         if (nrdc == NULL) {
3195                 mutex_exit(&rp->r_statelock);
3196                 nrdc = rddir_cache_alloc(KM_SLEEP);
3197                 nrdc->nfs_cookie = uiop->uio_offset;
3198                 nrdc->buflen = count;
3199                 goto top;
3200         }
3201 
3202         /*
3203          * Add this entry to the cache.
3204          */
3205         avl_insert(&rp->r_dir, nrdc, where);
3206         rddir_cache_hold(nrdc);
3207         mutex_exit(&rp->r_statelock);
3208 
3209 bottom:
3210 #ifdef DEBUG
3211         missed = 1;
3212         nfs_readdir_cache_misses++;
3213 #endif
3214         /*
3215          * Do the readdir.
3216          */
3217         error = nfsreaddir(vp, nrdc, cr);
3218 
3219         /*
3220          * If this operation failed, just return the error which occurred.
3221          */
3222         if (error != 0)
3223                 return (error);
3224 
3225         /*
3226          * Since the RPC operation will have taken sometime and blocked
3227          * this process, another pass through the cache will need to be
3228          * taken to find the correct cache entry.  It is possible that
3229          * the correct cache entry will not be there (although one was
3230          * added) because the directory changed during the RPC operation
3231          * and the readdir cache was flushed.  In this case, just start
3232          * over.  It is hoped that this will not happen too often... :-)
3233          */
3234         nrdc = NULL;
3235         goto top;
3236         /* NOTREACHED */
3237 }
3238 
     /*
      * Fill the readdir cache entry rdc for the directory vp with an
      * RFS_READDIR request starting at rdc->nfs_cookie.  The reply buffer
      * is rdc->buflen bytes; the count sent in each call is additionally
      * limited to mi->mi_curread.
      *
      * On success the next cookie, the eof indication, the length of the
      * returned data and a private copy of the entries are stored in rdc
      * and rdc->error is cleared.  On failure rdc->entries is set to NULL,
      * rdc->error records the error and RDDIRREQ is set in rdc->flags.
      *
      * RDDIR must be set in rdc->flags on entry (asserted after the call).
      * Before returning, RDDIR is cleared and, if RDDIRWAIT is set, it is
      * cleared and rdc->cv is broadcast; both happen under r_statelock.
      * rddir_cache_rele() is called on rdc on every return path.
      *
      * Returns 0 on success, otherwise the error from rfs2call() or the
      * result of geterrno() on the reply status.
      */
3239 static int
3240 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3241 {
3242         int error;
3243         struct nfsrddirargs rda;
3244         struct nfsrddirres rd;
3245         rnode_t *rp;
3246         mntinfo_t *mi;
3247         uint_t count;
3248         int douprintf;
3249         failinfo_t fi, *fip;
3250 
3251         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3252         count = rdc->buflen;
3253 
3254         rp = VTOR(vp);
3255         mi = VTOMI(vp);
3256 
3257         rda.rda_fh = *VTOFH(vp);
3258         rda.rda_offset = rdc->nfs_cookie;
3259 
3260         /*
3261          * NFS client failover support
3262          * suppress failover unless we have a zero cookie
3263          */
3264         if (rdc->nfs_cookie == (off_t)0) {
3265                 fi.vp = vp;
3266                 fi.fhp = (caddr_t)&rda.rda_fh;
3267                 fi.copyproc = nfscopyfh;
3268                 fi.lookupproc = nfslookup;
3269                 fi.xattrdirproc = acl_getxattrdir2;
3270                 fip = &fi;
3271         } else {
3272                 fip = NULL;
3273         }
3274 
             /*
              * Temporary reply buffer handed to the call through rd; it is
              * freed below after any entries have been copied into rdc.
              */
3275         rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3276         rd.rd_size = count;
3277         rd.rd_offset = rda.rda_offset;
3278 
3279         douprintf = 1;
3280 
3281         if (mi->mi_io_kstats) {
3282                 mutex_enter(&mi->mi_lock);
3283                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3284                 mutex_exit(&mi->mi_lock);
3285         }
3286 
             /*
              * Reissue the call for as long as rfs2call() returns
              * ENFS_TRYAGAIN, recomputing the request count from
              * mi_curread on each attempt.
              */
3287         do {
3288                 rda.rda_count = MIN(count, mi->mi_curread);
3289                 error = rfs2call(mi, RFS_READDIR,
3290                     xdr_rddirargs, (caddr_t)&rda,
3291                     xdr_getrddirres, (caddr_t)&rd, cr,
3292                     &douprintf, &rd.rd_status, 0, fip);
3293         } while (error == ENFS_TRYAGAIN);
3294 
3295         if (mi->mi_io_kstats) {
3296                 mutex_enter(&mi->mi_lock);
3297                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3298                 mutex_exit(&mi->mi_lock);
3299         }
3300 
3301         /*
3302          * Since we are actually doing a READDIR RPC, we must have
3303          * exclusive access to the cache entry being filled.  Thus,
3304          * it is safe to update all fields except for the flags
3305          * field.  The r_statelock in the rnode must be held to
3306          * prevent two different threads from simultaneously
3307          * attempting to update the flags field.  This can happen
3308          * if we are turning off RDDIR and the other thread is
3309          * trying to set RDDIRWAIT.
3310          */
3311         ASSERT(rdc->flags & RDDIR);
3312         if (!error) {
                     /* The call itself worked; now check the reply status. */
3313                 error = geterrno(rd.rd_status);
3314                 if (!error) {
3315                         rdc->nfs_ncookie = rd.rd_offset;
3316                         rdc->eof = rd.rd_eof ? 1 : 0;
3317                         rdc->entlen = rd.rd_size;
3318                         ASSERT(rdc->entlen <= rdc->buflen);
3319 #ifdef DEBUG
3320                         rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3321                             KM_SLEEP);
3322 #else
3323                         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3324 #endif
3325                         bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3326                         rdc->error = 0;
3327                         if (mi->mi_io_kstats) {
3328                                 mutex_enter(&mi->mi_lock);
3329                                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3330                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3331                                     rd.rd_size;
3332                                 mutex_exit(&mi->mi_lock);
3333                         }
3334                 } else {
3335                         PURGE_STALE_FH(error, vp, cr);
3336                 }
3337         }
3338         if (error) {
3339                 rdc->entries = NULL;
3340                 rdc->error = error;
3341         }
3342         kmem_free(rd.rd_entries, rdc->buflen);
3343 
             /*
              * Update the flags under r_statelock: the fill is finished,
              * so wake any waiters, and on failure set RDDIRREQ on the
              * entry.
              */
3344         mutex_enter(&rp->r_statelock);
3345         rdc->flags &= ~RDDIR;
3346         if (rdc->flags & RDDIRWAIT) {
3347                 rdc->flags &= ~RDDIRWAIT;
3348                 cv_broadcast(&rdc->cv);
3349         }
3350         if (error)
3351                 rdc->flags |= RDDIRREQ;
3352         mutex_exit(&rp->r_statelock);
3353 
3354         rddir_cache_rele(rdc);
3355 
3356         return (error);
3357 }
3358 
3359 #ifdef DEBUG
     /*
      * Debug knob: when non-zero, nfs_bio() calls debug_enter() after it
      * has warned about a zero length write.
      */
3360 static int nfs_bio_do_stop = 0;
3361 #endif
3362 
     /*
      * Perform the I/O described by bp against bp->b_vp: a read through
      * nfsread() when B_READ is set in b_flags, otherwise a write through
      * nfswrite().
      *
      * The credential cached in the rnode (r_cred) is used when there is
      * one; otherwise cr is installed as r_cred and used.  If the request
      * fails with EACCES while using a credential other than cr, r_cred is
      * replaced with cr and the request is retried with cr.
      *
      * Reads: the part of the buffer that was not filled (b_resid bytes at
      * the end) is zeroed.  If nothing at all was read and the offset is at
      * or beyond r_size, NFS_EOF is returned.
      *
      * Writes: the length is limited to r_size - offset.  If the rnode is
      * already marked RSTALE no write is attempted and r_error (or ESTALE
      * when r_error is 0) is returned.  Write errors other than EINTR may
      * be reported through nfs_write_error() and recorded in the rnode.
      *
      * B_ERROR is set in bp->b_flags for any error other than NFS_EOF.
      * Returns 0, NFS_EOF, or an error number.
      */
3363 static int
3364 nfs_bio(struct buf *bp, cred_t *cr)
3365 {
3366         rnode_t *rp = VTOR(bp->b_vp);
3367         int count;
3368         int error;
3369         cred_t *cred;
3370         uint_t offset;
3371 
3372         DTRACE_IO1(start, struct buf *, bp);
3373 
3374         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
             /*
              * NOTE(review): offset is held in a uint_t; confirm that
              * dbtob(b_blkno) always fits for files handled here.
              */
3375         offset = dbtob(bp->b_blkno);
3376 
3377         if (bp->b_flags & B_READ) {
                     /*
                      * Pick the credential to use, taking a hold for this
                      * request (and a second hold for r_cred when cr is
                      * installed there).
                      */
3378                 mutex_enter(&rp->r_statelock);
3379                 if (rp->r_cred != NULL) {
3380                         cred = rp->r_cred;
3381                         crhold(cred);
3382                 } else {
3383                         rp->r_cred = cr;
3384                         crhold(cr);
3385                         cred = cr;
3386                         crhold(cred);
3387                 }
3388                 mutex_exit(&rp->r_statelock);
3389         read_again:
3390                 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3391                     offset, bp->b_bcount, &bp->b_resid, cred);
3392 
                     /*
                      * Drop this request's hold; the EACCES retry below takes
                      * a fresh hold on cr before jumping back to read_again.
                      */
3393                 crfree(cred);
3394                 if (!error) {
3395                         if (bp->b_resid) {
3396                                 /*
3397                                  * Didn't get it all because we hit EOF,
3398                                  * zero all the memory beyond the EOF.
3399                                  */
3400                                 /* bzero(rdaddr + */
3401                                 bzero(bp->b_un.b_addr +
3402                                     bp->b_bcount - bp->b_resid, bp->b_resid);
3403                         }
3404                         mutex_enter(&rp->r_statelock);
3405                         if (bp->b_resid == bp->b_bcount &&
3406                             offset >= rp->r_size) {
3407                                 /*
3408                                  * We didn't read anything at all as we are
3409                                  * past EOF.  Return an error indicator back
3410                                  * but don't destroy the pages (yet).
3411                                  */
3412                                 error = NFS_EOF;
3413                         }
3414                         mutex_exit(&rp->r_statelock);
3415                 } else if (error == EACCES) {
3416                         mutex_enter(&rp->r_statelock);
                             /*
                              * Only the pointer value of cred is compared
                              * here; retry with the caller's credential if
                              * that is not what was just used.
                              */
3417                         if (cred != cr) {
3418                                 if (rp->r_cred != NULL)
3419                                         crfree(rp->r_cred);
3420                                 rp->r_cred = cr;
3421                                 crhold(cr);
3422                                 cred = cr;
3423                                 crhold(cred);
3424                                 mutex_exit(&rp->r_statelock);
3425                                 goto read_again;
3426                         }
3427                         mutex_exit(&rp->r_statelock);
3428                 }
3429         } else {
3430                 if (!(rp->r_flags & RSTALE)) {
                             /* Same credential selection as the read path. */
3431                         mutex_enter(&rp->r_statelock);
3432                         if (rp->r_cred != NULL) {
3433                                 cred = rp->r_cred;
3434                                 crhold(cred);
3435                         } else {
3436                                 rp->r_cred = cr;
3437                                 crhold(cr);
3438                                 cred = cr;
3439                                 crhold(cred);
3440                         }
3441                         mutex_exit(&rp->r_statelock);
3442                 write_again:
                             /*
                              * Never write beyond the file size recorded in
                              * the rnode.
                              */
3443                         mutex_enter(&rp->r_statelock);
3444                         count = MIN(bp->b_bcount, rp->r_size - offset);
3445                         mutex_exit(&rp->r_statelock);
3446                         if (count < 0)
3447                                 cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3448 #ifdef DEBUG
                             /*
                              * NOTE(review): offset is a uint_t but is
                              * formatted with %d below.
                              */
3449                         if (count == 0) {
3450                                 zcmn_err(getzoneid(), CE_WARN,
3451                                     "nfs_bio: zero length write at %d",
3452                                     offset);
3453                                 nfs_printfhandle(&rp->r_fh);
3454                                 if (nfs_bio_do_stop)
3455                                         debug_enter("nfs_bio");
3456                         }
3457 #endif
3458                         error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3459                             count, cred);
3460                         if (error == EACCES) {
3461                                 mutex_enter(&rp->r_statelock);
3462                                 if (cred != cr) {
3463                                         if (rp->r_cred != NULL)
3464                                                 crfree(rp->r_cred);
3465                                         rp->r_cred = cr;
3466                                         crhold(cr);
3467                                         crfree(cred);
3468                                         cred = cr;
3469                                         crhold(cred);
3470                                         mutex_exit(&rp->r_statelock);
3471                                         goto write_again;
3472                                 }
3473                                 mutex_exit(&rp->r_statelock);
3474                         }
3475                         bp->b_error = error;
3476                         if (error && error != EINTR) {
3477                                 /*
3478                                  * Don't print EDQUOT errors on the console.
3479                                  * Don't print asynchronous EACCES errors.
3480                                  * Don't print EFBIG errors.
3481                                  * Print all other write errors.
3482                                  */
3483                                 if (error != EDQUOT && error != EFBIG &&
3484                                     (error != EACCES ||
3485                                     !(bp->b_flags & B_ASYNC)))
3486                                         nfs_write_error(bp->b_vp, error, cred);
3487                                 /*
3488                                  * Update r_error and r_flags as appropriate.
3489                                  * If the error was ESTALE, then mark the
3490                                  * rnode as not being writeable and save
3491                                  * the error status.  Otherwise, save any
3492                                  * errors which occur from asynchronous
3493                                  * page invalidations.  Any errors occurring
3494                                  * from other operations should be saved
3495                                  * by the caller.
3496                                  */
3497                                 mutex_enter(&rp->r_statelock);
3498                                 if (error == ESTALE) {
3499                                         rp->r_flags |= RSTALE;
3500                                         if (!rp->r_error)
3501                                                 rp->r_error = error;
3502                                 } else if (!rp->r_error &&
3503                                     (bp->b_flags &
3504                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
3505                                     (B_INVAL|B_FORCE|B_ASYNC)) {
3506                                         rp->r_error = error;
3507                                 }
3508                                 mutex_exit(&rp->r_statelock);
3509                         }
                             /* Release the hold taken for this write. */
3510                         crfree(cred);
3511                 } else {
3512                         error = rp->r_error;
3513                         /*
3514                          * A close may have cleared r_error, if so,
3515                          * propagate ESTALE error return properly
3516                          */
3517                         if (error == 0)
3518                                 error = ESTALE;
3519                 }
3520         }
3521 
             /* NFS_EOF is an indicator for the caller, not a buffer error. */
3522         if (error != 0 && error != NFS_EOF)
3523                 bp->b_flags |= B_ERROR;
3524 
3525         DTRACE_IO1(done, struct buf *, bp);
3526 
3527         return (error);
3528 }
3529 
     /*
      * Build a file identifier for vp in the caller supplied fidp, using
      * the file handle stored in the rnode.
      *
      * The required length is sizeof (struct nfs_fid) - sizeof (short).  If
      * fidp->fid_len is smaller than that, fid_len is set to the required
      * length and ENOSPC is returned.  Otherwise fidp is filled in as a
      * struct nfs_fid (pad zeroed, length set, NFS_FHSIZE bytes of file
      * handle copied) and 0 is returned.
      */
3530 /* ARGSUSED */
3531 static int
3532 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3533 {
3534         struct nfs_fid *fp;
3535         rnode_t *rp;
3536 
3537         rp = VTOR(vp);
3538 
3539         if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3540                 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3541                 return (ENOSPC);
3542         }
             /* Overlay the NFS specific layout on the caller's fid. */
3543         fp = (struct nfs_fid *)fidp;
3544         fp->nf_pad = 0;
3545         fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3546         bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3547         return (0);
3548 }
3549 
     /*
      * Acquire the rnode's r_rwlock as reader or writer.
      *
      * A reader lock is taken when write_lock is zero.  When write_lock is
      * non-zero, a reader lock is still used if direct I/O is enabled on
      * the rnode (RDIRECTIO) or on the mount (MI_DIRECTIO) and the file has
      * no mappings (r_mapcnt == 0) and no cached data; in every other case
      * the writer lock is taken.
      *
      * Returns V_WRITELOCK_FALSE when a reader lock is held on return and
      * V_WRITELOCK_TRUE when the writer lock is held.  The return value of
      * nfs_rw_enter_sig() is deliberately discarded.
      */
3550 /* ARGSUSED2 */
3551 static int
3552 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3553 {
3554         rnode_t *rp = VTOR(vp);
3555 
3556         if (!write_lock) {
3557                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3558                 return (V_WRITELOCK_FALSE);
3559         }
3560 
3561         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3562                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3563                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3564                         return (V_WRITELOCK_FALSE);
                     /*
                      * Mappings or cached data exist: give up the reader
                      * lock and fall through to take the writer lock.
                      */
3565                 nfs_rw_exit(&rp->r_rwlock);
3566         }
3567 
3568         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3569         return (V_WRITELOCK_TRUE);
3570 }
3571 
     /*
      * Release the rnode's r_rwlock.  The same nfs_rw_exit() call is used
      * whichever mode the lock was taken in, so write_lock is not examined.
      */
3572 /* ARGSUSED */
3573 static void
3574 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3575 {
3576         rnode_t *rp = VTOR(vp);
3577 
3578         nfs_rw_exit(&rp->r_rwlock);
3579 }
3580 
     /*
      * Validate a seek to the offset *noffp on vp.
      *
      * Directories accept any offset (see below).  For all other vnode
      * types the new offset must lie in the range [0, MAXOFF32_T];
      * otherwise EINVAL is returned.  Neither ooff nor *noffp is modified.
      *
      * Returns 0 if the offset is acceptable, EINVAL if not.
      */
3581 /* ARGSUSED */
3582 static int
3583 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3584 {
3585 
3586         /*
3587          * Because we stuff the readdir cookie into the offset field
3588          * someone may attempt to do an lseek with the cookie which
3589          * we want to succeed.
3590          */
3591         if (vp->v_type == VDIR)
3592                 return (0);
3593         if (*noffp < 0 || *noffp > MAXOFF32_T)
3594                 return (EINVAL);
3595         return (0);
3596 }
3597 
3598 /*
3599  * number of NFS_MAXDATA blocks to read ahead
3600  * optimized for 100 base-T.
      *
      * nfs_getapage() uses this as the largest number of readaheads it
      * queues for a single request.
3601  */
3602 static int nfs_nra = 4;
3603 
3604 #ifdef DEBUG
3605 static int nfs_lostpage = 0;    /* number of times we lost original page */
3606 #endif
3607 
3608 /*
3609  * Return all the pages from [off..off+len) in file
      *
      * Fails with ENOSYS if the vnode is marked VNOMAP and with EIO when
      * called from a zone other than the one the mount belongs to.  If
      * protp is non-NULL, *protp is set to PROT_ALL.  The caches are
      * validated with nfs_validate_caches() before any page is fetched, and
      * its error, if any, is returned.
      *
      * EFAULT is returned when off + len exceeds r_size + PAGEOFFSET and
      * seg is not segkmap.
      *
      * Requests of at most PAGESIZE bytes go straight to nfs_getapage();
      * larger requests go through pvn_getpages().  An NFS_EOF result purges
      * the caches and restarts from the retry label; ESTALE is handed to
      * PURGE_STALE_FH before being returned.  Any other result is returned
      * unchanged.
3610  */
3611 /* ARGSUSED */
3612 static int
3613 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3614         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3615         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3616 {
3617         rnode_t *rp;
3618         int error;
3619         mntinfo_t *mi;
3620 
3621         if (vp->v_flag & VNOMAP)
3622                 return (ENOSYS);
3623 
3624         ASSERT(off <= MAXOFF32_T);
3625         if (nfs_zone() != VTOMI(vp)->mi_zone)
3626                 return (EIO);
3627         if (protp != NULL)
3628                 *protp = PROT_ALL;
3629 
3630         /*
3631          * Now validate that the caches are up to date.
3632          */
3633         error = nfs_validate_caches(vp, cr);
3634         if (error)
3635                 return (error);
3636 
3637         rp = VTOR(vp);
3638         mi = VTOMI(vp);
3639 retry:
3640         mutex_enter(&rp->r_statelock);
3641 
3642         /*
3643          * Don't create dirty pages faster than they
3644          * can be cleaned so that the system doesn't
3645          * get imbalanced.  If the async queue is
3646          * maxed out, then wait for it to drain before
3647          * creating more dirty pages.  Also, wait for
3648          * any threads doing pagewalks in the vop_getattr
3649          * entry points so that they don't block for
3650          * long periods.
3651          */
3652         if (rw == S_CREATE) {
3653                 while ((mi->mi_max_threads != 0 &&
3654                     rp->r_awcount > 2 * mi->mi_max_threads) ||
3655                     rp->r_gcount > 0)
3656                         cv_wait(&rp->r_cv, &rp->r_statelock);
3657         }
3658 
3659         /*
3660          * If we are getting called as a side effect of an nfs_write()
3661          * operation the local file size might not be extended yet.
3662          * In this case we want to be able to return pages of zeroes.
3663          */
3664         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3665                 mutex_exit(&rp->r_statelock);
3666                 return (EFAULT);                /* beyond EOF */
3667         }
3668 
3669         mutex_exit(&rp->r_statelock);
3670 
             /* A single page is fetched directly, anything larger via pvn. */
3671         if (len <= PAGESIZE) {
3672                 error = nfs_getapage(vp, off, len, protp, pl, plsz,
3673                     seg, addr, rw, cr);
3674         } else {
3675                 error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3676                     pl, plsz, seg, addr, rw, cr);
3677         }
3678 
3679         switch (error) {
3680         case NFS_EOF:
                     /* Purge the caches (but not the DNLC) and start over. */
3681                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3682                 goto retry;
3683         case ESTALE:
3684                 PURGE_STALE_FH(error, vp, cr);
3685         }
3686 
3687         return (error);
3688 }
3689 
3690 /*
3691  * Called from pvn_getpages or nfs_getpage to get a particular page.
3692  */
3693 /* ARGSUSED */
3694 static int
3695 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3696         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3697         enum seg_rw rw, cred_t *cr)
3698 {
3699         rnode_t *rp;
3700         uint_t bsize;
3701         struct buf *bp;
3702         page_t *pp;
3703         u_offset_t lbn;
3704         u_offset_t io_off;
3705         u_offset_t blkoff;
3706         u_offset_t rablkoff;
3707         size_t io_len;
3708         uint_t blksize;
3709         int error;
3710         int readahead;
3711         int readahead_issued = 0;
3712         int ra_window; /* readahead window */
3713         page_t *pagefound;
3714 
3715         if (nfs_zone() != VTOMI(vp)->mi_zone)
3716                 return (EIO);
3717         rp = VTOR(vp);
3718         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3719 
3720 reread:
3721         bp = NULL;
3722         pp = NULL;
3723         pagefound = NULL;
3724 
3725         if (pl != NULL)
3726                 pl[0] = NULL;
3727 
3728         error = 0;
3729         lbn = off / bsize;
3730         blkoff = lbn * bsize;
3731 
3732         /*
3733          * Queueing up the readahead before doing the synchronous read
3734          * results in a significant increase in read throughput because
3735          * of the increased parallelism between the async threads and
3736          * the process context.
3737          */
3738         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3739             rw != S_CREATE &&
3740             !(vp->v_flag & VNOCACHE)) {
3741                 mutex_enter(&rp->r_statelock);
3742 
3743                 /*
3744                  * Calculate the number of readaheads to do.
3745                  * a) No readaheads at offset = 0.
3746                  * b) Do maximum(nfs_nra) readaheads when the readahead
3747                  *    window is closed.
3748                  * c) Do readaheads between 1 to (nfs_nra - 1) depending
3749                  *    upon how far the readahead window is open or close.
3750                  * d) No readaheads if rp->r_nextr is not within the scope
3751                  *    of the readahead window (random i/o).
3752                  */
3753 
3754                 if (off == 0)
3755                         readahead = 0;
3756                 else if (blkoff == rp->r_nextr)
3757                         readahead = nfs_nra;
3758                 else if (rp->r_nextr > blkoff &&
3759                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
3760                     <= (nfs_nra - 1)))
3761                         readahead = nfs_nra - ra_window;
3762                 else
3763                         readahead = 0;
3764 
3765                 rablkoff = rp->r_nextr;
3766                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3767                         mutex_exit(&rp->r_statelock);
3768                         if (nfs_async_readahead(vp, rablkoff + bsize,
3769                             addr + (rablkoff + bsize - off), seg, cr,
3770                             nfs_readahead) < 0) {
3771                                 mutex_enter(&rp->r_statelock);
3772                                 break;
3773                         }
3774                         readahead--;
3775                         rablkoff += bsize;
3776                         /*
3777                          * Indicate that we did a readahead so
3778                          * readahead offset is not updated
3779                          * by the synchronous read below.
3780                          */
3781                         readahead_issued = 1;
3782                         mutex_enter(&rp->r_statelock);
3783                         /*
3784                          * set readahead offset to
3785                          * offset of last async readahead
3786                          * request.
3787                          */
3788                         rp->r_nextr = rablkoff;
3789                 }
3790                 mutex_exit(&rp->r_statelock);
3791         }
3792 
3793 again:
3794         if ((pagefound = page_exists(vp, off)) == NULL) {
3795                 if (pl == NULL) {
3796                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3797                             nfs_readahead);
3798                 } else if (rw == S_CREATE) {
3799                         /*
3800                          * Block for this page is not allocated, or the offset
3801                          * is beyond the current allocation size, or we're
3802                          * allocating a swap slot and the page was not found,
3803                          * so allocate it and return a zero page.
3804                          */
3805                         if ((pp = page_create_va(vp, off,
3806                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3807                                 cmn_err(CE_PANIC, "nfs_getapage: page_create");
3808                         io_len = PAGESIZE;
3809                         mutex_enter(&rp->r_statelock);
3810                         rp->r_nextr = off + PAGESIZE;
3811                         mutex_exit(&rp->r_statelock);
3812                 } else {
3813                         /*
3814                          * Need to go to server to get a BLOCK, exception to
3815                          * that being while reading at offset = 0 or doing
3816                          * random i/o, in that case read only a PAGE.
3817                          */
3818                         mutex_enter(&rp->r_statelock);
3819                         if (blkoff < rp->r_size &&
3820                             blkoff + bsize >= rp->r_size) {
3821                                 /*
3822                                  * If only a block or less is left in
3823                                  * the file, read all that is remaining.
3824                                  */
3825                                 if (rp->r_size <= off) {
3826                                         /*
3827                                          * Trying to access beyond EOF,
3828                                          * set up to get at least one page.
3829                                          */
3830                                         blksize = off + PAGESIZE - blkoff;
3831                                 } else
3832                                         blksize = rp->r_size - blkoff;
3833                         } else if ((off == 0) ||
3834                             (off != rp->r_nextr && !readahead_issued)) {
3835                                 blksize = PAGESIZE;
3836                                 blkoff = off; /* block = page here */
3837                         } else
3838                                 blksize = bsize;
3839                         mutex_exit(&rp->r_statelock);
3840 
3841                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3842                             &io_len, blkoff, blksize, 0);
3843 
3844                         /*
3845                          * Some other thread has entered the page,
3846                          * so just use it.
3847                          */
3848                         if (pp == NULL)
3849                                 goto again;
3850 
3851                         /*
3852                          * Now round the request size up to page boundaries.
3853                          * This ensures that the entire page will be
3854                          * initialized to zeroes if EOF is encountered.
3855                          */
3856                         io_len = ptob(btopr(io_len));
3857 
3858                         bp = pageio_setup(pp, io_len, vp, B_READ);
3859                         ASSERT(bp != NULL);
3860 
3861                         /*
3862                          * pageio_setup should have set b_addr to 0.  This
3863                          * is correct since we want to do I/O on a page
3864                          * boundary.  bp_mapin will use this addr to calculate
3865                          * an offset, and then set b_addr to the kernel virtual
3866                          * address it allocated for us.
3867                          */
3868                         ASSERT(bp->b_un.b_addr == 0);
3869 
3870                         bp->b_edev = 0;
3871                         bp->b_dev = 0;
3872                         bp->b_lblkno = lbtodb(io_off);
3873                         bp->b_file = vp;
3874                         bp->b_offset = (offset_t)off;
3875                         bp_mapin(bp);
3876 
3877                         /*
3878                          * If doing a write beyond what we believe is EOF,
3879                          * don't bother trying to read the pages from the
3880                          * server, we'll just zero the pages here.  We
3881                          * don't check that the rw flag is S_WRITE here
3882                          * because some implementations may attempt a
3883                          * read access to the buffer before copying data.
3884                          */
3885                         mutex_enter(&rp->r_statelock);
3886                         if (io_off >= rp->r_size && seg == segkmap) {
3887                                 mutex_exit(&rp->r_statelock);
3888                                 bzero(bp->b_un.b_addr, io_len);
3889                         } else {
3890                                 mutex_exit(&rp->r_statelock);
3891                                 error = nfs_bio(bp, cr);
3892                         }
3893 
3894                         /*
3895                          * Unmap the buffer before freeing it.
3896                          */
3897                         bp_mapout(bp);
3898                         pageio_done(bp);
3899 
3900                         if (error == NFS_EOF) {
3901                                 /*
3902                                  * If doing a write system call just return
3903                                  * zeroed pages, else user tried to get pages
3904                                  * beyond EOF, return error.  We don't check
3905                                  * that the rw flag is S_WRITE here because
3906                                  * some implementations may attempt a read
3907                                  * access to the buffer before copying data.
3908                                  */
3909                                 if (seg == segkmap)
3910                                         error = 0;
3911                                 else
3912                                         error = EFAULT;
3913                         }
3914 
3915                         if (!readahead_issued && !error) {
3916                                 mutex_enter(&rp->r_statelock);
3917                                 rp->r_nextr = io_off + io_len;
3918                                 mutex_exit(&rp->r_statelock);
3919                         }
3920                 }
3921         }
3922 
3923 out:
3924         if (pl == NULL)
3925                 return (error);
3926 
3927         if (error) {
3928                 if (pp != NULL)
3929                         pvn_read_done(pp, B_ERROR);
3930                 return (error);
3931         }
3932 
3933         if (pagefound) {
3934                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3935 
3936                 /*
3937                  * Page exists in the cache, acquire the appropriate lock.
3938                  * If this fails, start all over again.
3939                  */
3940                 if ((pp = page_lookup(vp, off, se)) == NULL) {
3941 #ifdef DEBUG
3942                         nfs_lostpage++;
3943 #endif
3944                         goto reread;
3945                 }
3946                 pl[0] = pp;
3947                 pl[1] = NULL;
3948                 return (0);
3949         }
3950 
3951         if (pp != NULL)
3952                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3953 
3954         return (error);
3955 }
3956 
3957 static void
3958 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3959         cred_t *cr)
3960 {
3961         int error;
3962         page_t *pp;
3963         u_offset_t io_off;
3964         size_t io_len;
3965         struct buf *bp;
3966         uint_t bsize, blksize;
3967         rnode_t *rp = VTOR(vp);
3968 
3969         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3970 
3971         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3972 
3973         mutex_enter(&rp->r_statelock);
3974         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3975                 /*
3976                  * If less than a block left in file read less
3977                  * than a block.
3978                  */
3979                 blksize = rp->r_size - blkoff;
3980         } else
3981                 blksize = bsize;
3982         mutex_exit(&rp->r_statelock);
3983 
3984         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3985             &io_off, &io_len, blkoff, blksize, 1);
3986         /*
3987          * The isra flag passed to the kluster function is 1, we may have
3988          * gotten a return value of NULL for a variety of reasons (# of free
3989          * pages < minfree, someone entered the page on the vnode etc). In all
3990          * cases, we want to punt on the readahead.
3991          */
3992         if (pp == NULL)
3993                 return;
3994 
3995         /*
3996          * Now round the request size up to page boundaries.
3997          * This ensures that the entire page will be
3998          * initialized to zeroes if EOF is encountered.
3999          */
4000         io_len = ptob(btopr(io_len));
4001 
4002         bp = pageio_setup(pp, io_len, vp, B_READ);
4003         ASSERT(bp != NULL);
4004 
4005         /*
4006          * pageio_setup should have set b_addr to 0.  This is correct since
4007          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4008          * to calculate an offset, and then set b_addr to the kernel virtual
4009          * address it allocated for us.
4010          */
4011         ASSERT(bp->b_un.b_addr == 0);
4012 
4013         bp->b_edev = 0;
4014         bp->b_dev = 0;
4015         bp->b_lblkno = lbtodb(io_off);
4016         bp->b_file = vp;
4017         bp->b_offset = (offset_t)blkoff;
4018         bp_mapin(bp);
4019 
4020         /*
4021          * If doing a write beyond what we believe is EOF, don't bother trying
4022          * to read the pages from the server, we'll just zero the pages here.
4023          * We don't check that the rw flag is S_WRITE here because some
4024          * implementations may attempt a read access to the buffer before
4025          * copying data.
4026          */
4027         mutex_enter(&rp->r_statelock);
4028         if (io_off >= rp->r_size && seg == segkmap) {
4029                 mutex_exit(&rp->r_statelock);
4030                 bzero(bp->b_un.b_addr, io_len);
4031                 error = 0;
4032         } else {
4033                 mutex_exit(&rp->r_statelock);
4034                 error = nfs_bio(bp, cr);
4035                 if (error == NFS_EOF)
4036                         error = 0;
4037         }
4038 
4039         /*
4040          * Unmap the buffer before freeing it.
4041          */
4042         bp_mapout(bp);
4043         pageio_done(bp);
4044 
4045         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4046 
4047         /*
4048          * In case of error set readahead offset
4049          * to the lowest offset.
4050          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4051          */
4052         if (error && rp->r_nextr > io_off) {
4053                 mutex_enter(&rp->r_statelock);
4054                 if (rp->r_nextr > io_off)
4055                         rp->r_nextr = io_off;
4056                 mutex_exit(&rp->r_statelock);
4057         }
4058 }
4059 
4060 /*
4061  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4062  * If len == 0, do from off to EOF.
4063  *
4064  * The normal cases should be len == 0 && off == 0 (entire vp list),
4065  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4066  * (from pageout).
4067  */
4068 /* ARGSUSED */
4069 static int
4070 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4071         caller_context_t *ct)
4072 {
4073         int error;
4074         rnode_t *rp;
4075 
4076         ASSERT(cr != NULL);
4077 
4078         /*
4079          * XXX - Why should this check be made here?
4080          */
4081         if (vp->v_flag & VNOMAP)
4082                 return (ENOSYS);
4083 
4084         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4085                 return (0);
4086 
4087         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4088                 return (EIO);
4089         ASSERT(off <= MAXOFF32_T);
4090 
4091         rp = VTOR(vp);
4092         mutex_enter(&rp->r_statelock);
4093         rp->r_count++;
4094         mutex_exit(&rp->r_statelock);
4095         error = nfs_putpages(vp, off, len, flags, cr);
4096         mutex_enter(&rp->r_statelock);
4097         rp->r_count--;
4098         cv_broadcast(&rp->r_cv);
4099         mutex_exit(&rp->r_statelock);
4100 
4101         return (error);
4102 }
4103 
4104 /*
4105  * Write out a single page, possibly klustering adjacent dirty pages.
4106  */
4107 int
4108 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4109         int flags, cred_t *cr)
4110 {
4111         u_offset_t io_off;
4112         u_offset_t lbn_off;
4113         u_offset_t lbn;
4114         size_t io_len;
4115         uint_t bsize;
4116         int error;
4117         rnode_t *rp;
4118 
4119         ASSERT(!vn_is_readonly(vp));
4120         ASSERT(pp != NULL);
4121         ASSERT(cr != NULL);
4122         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4123 
4124         rp = VTOR(vp);
4125         ASSERT(rp->r_count > 0);
4126 
4127         ASSERT(pp->p_offset <= MAXOFF32_T);
4128 
4129         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4130         lbn = pp->p_offset / bsize;
4131         lbn_off = lbn * bsize;
4132 
4133         /*
4134          * Find a kluster that fits in one block, or in
4135          * one page if pages are bigger than blocks.  If
4136          * there is less file space allocated than a whole
4137          * page, we'll shorten the i/o request below.
4138          */
4139         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4140             roundup(bsize, PAGESIZE), flags);
4141 
4142         /*
4143          * pvn_write_kluster shouldn't have returned a page with offset
4144          * behind the original page we were given.  Verify that.
4145          */
4146         ASSERT((pp->p_offset / bsize) >= lbn);
4147 
4148         /*
4149          * Now pp will have the list of kept dirty pages marked for
4150          * write back.  It will also handle invalidation and freeing
4151          * of pages that are not dirty.  Check for page length rounding
4152          * problems.
4153          */
4154         if (io_off + io_len > lbn_off + bsize) {
4155                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4156                 io_len = lbn_off + bsize - io_off;
4157         }
4158         /*
4159          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4160          * consistent value of r_size. RMODINPROGRESS is set in writerp().
4161          * When RMODINPROGRESS is set it indicates that a uiomove() is in
4162          * progress and the r_size has not been made consistent with the
4163          * new size of the file. When the uiomove() completes the r_size is
4164          * updated and the RMODINPROGRESS flag is cleared.
4165          *
4166          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4167          * consistent value of r_size. Without this handshaking, it is
4168          * possible that nfs(3)_bio() picks  up the old value of r_size
4169          * before the uiomove() in writerp() completes. This will result
4170          * in the write through nfs(3)_bio() being dropped.
4171          *
4172          * More precisely, there is a window between the time the uiomove()
4173          * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4174          * operation intervenes in this window, the page will be picked up,
4175          * because it is dirty (it will be unlocked, unless it was
4176          * pagecreate'd). When the page is picked up as dirty, the dirty
4177          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4178          * checked. This will still be the old size. Therefore the page will
4179          * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4180          * the page will be found to be clean and the write will be dropped.
4181          */
4182         if (rp->r_flags & RMODINPROGRESS) {
4183                 mutex_enter(&rp->r_statelock);
4184                 if ((rp->r_flags & RMODINPROGRESS) &&
4185                     rp->r_modaddr + MAXBSIZE > io_off &&
4186                     rp->r_modaddr < io_off + io_len) {
4187                         page_t *plist;
4188                         /*
4189                          * A write is in progress for this region of the file.
4190                          * If we did not detect RMODINPROGRESS here then this
4191                          * path through nfs_putapage() would eventually go to
4192                          * nfs(3)_bio() and may not write out all of the data
4193                          * in the pages. We end up losing data. So we decide
4194                          * to set the modified bit on each page in the page
4195                          * list and mark the rnode with RDIRTY. This write
4196                          * will be restarted at some later time.
4197                          */
4198                         plist = pp;
4199                         while (plist != NULL) {
4200                                 pp = plist;
4201                                 page_sub(&plist, pp);
4202                                 hat_setmod(pp);
4203                                 page_io_unlock(pp);
4204                                 page_unlock(pp);
4205                         }
4206                         rp->r_flags |= RDIRTY;
4207                         mutex_exit(&rp->r_statelock);
4208                         if (offp)
4209                                 *offp = io_off;
4210                         if (lenp)
4211                                 *lenp = io_len;
4212                         return (0);
4213                 }
4214                 mutex_exit(&rp->r_statelock);
4215         }
4216 
4217         if (flags & B_ASYNC) {
4218                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4219                     nfs_sync_putapage);
4220         } else
4221                 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4222 
4223         if (offp)
4224                 *offp = io_off;
4225         if (lenp)
4226                 *lenp = io_len;
4227         return (error);
4228 }
4229 
4230 static int
4231 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4232         int flags, cred_t *cr)
4233 {
4234         int error;
4235         rnode_t *rp;
4236 
4237         flags |= B_WRITE;
4238 
4239         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4240         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4241 
4242         rp = VTOR(vp);
4243 
4244         if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4245             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4246                 if (!(rp->r_flags & ROUTOFSPACE)) {
4247                         mutex_enter(&rp->r_statelock);
4248                         rp->r_flags |= ROUTOFSPACE;
4249                         mutex_exit(&rp->r_statelock);
4250                 }
4251                 flags |= B_ERROR;
4252                 pvn_write_done(pp, flags);
4253                 /*
4254                  * If this was not an async thread, then try again to
4255                  * write out the pages, but this time, also destroy
4256                  * them whether or not the write is successful.  This
4257                  * will prevent memory from filling up with these
4258                  * pages and destroying them is the only alternative
4259                  * if they can't be written out.
4260                  *
4261                  * Don't do this if this is an async thread because
4262                  * when the pages are unlocked in pvn_write_done,
4263                  * some other thread could have come along, locked
4264                  * them, and queued for an async thread.  It would be
4265                  * possible for all of the async threads to be tied
4266                  * up waiting to lock the pages again and they would
4267                  * all already be locked and waiting for an async
4268                  * thread to handle them.  Deadlock.
4269                  */
4270                 if (!(flags & B_ASYNC)) {
4271                         error = nfs_putpage(vp, io_off, io_len,
4272                             B_INVAL | B_FORCE, cr, NULL);
4273                 }
4274         } else {
4275                 if (error)
4276                         flags |= B_ERROR;
4277                 else if (rp->r_flags & ROUTOFSPACE) {
4278                         mutex_enter(&rp->r_statelock);
4279                         rp->r_flags &= ~ROUTOFSPACE;
4280                         mutex_exit(&rp->r_statelock);
4281                 }
4282                 pvn_write_done(pp, flags);
4283         }
4284 
4285         return (error);
4286 }
4287 
4288 /* ARGSUSED */
4289 static int
4290 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4291         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4292         caller_context_t *ct)
4293 {
4294         struct segvn_crargs vn_a;
4295         int error;
4296         rnode_t *rp;
4297         struct vattr va;
4298 
4299         if (nfs_zone() != VTOMI(vp)->mi_zone)
4300                 return (EIO);
4301 
4302         if (vp->v_flag & VNOMAP)
4303                 return (ENOSYS);
4304 
4305         if (off > MAXOFF32_T)
4306                 return (EFBIG);
4307 
4308         if (off < 0 || off + len < 0)
4309                 return (ENXIO);
4310 
4311         if (vp->v_type != VREG)
4312                 return (ENODEV);
4313 
4314         /*
4315          * If there is cached data and if close-to-open consistency
4316          * checking is not turned off and if the file system is not
4317          * mounted readonly, then force an over the wire getattr.
4318          * Otherwise, just invoke nfsgetattr to get a copy of the
4319          * attributes.  The attribute cache will be used unless it
4320          * is timed out and if it is, then an over the wire getattr
4321          * will be issued.
4322          */
4323         va.va_mask = AT_ALL;
4324         if (vn_has_cached_data(vp) &&
4325             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4326                 error = nfs_getattr_otw(vp, &va, cr);
4327         else
4328                 error = nfsgetattr(vp, &va, cr);
4329         if (error)
4330                 return (error);
4331 
4332         /*
4333          * Check to see if the vnode is currently marked as not cachable.
4334          * This means portions of the file are locked (through VOP_FRLOCK).
4335          * In this case the map request must be refused.  We use
4336          * rp->r_lkserlock to avoid a race with concurrent lock requests.
4337          */
4338         rp = VTOR(vp);
4339 
4340         /*
4341          * Atomically increment r_inmap after acquiring r_rwlock. The
4342          * idea here is to acquire r_rwlock to block read/write and
4343          * not to protect r_inmap. r_inmap will inform nfs_read/write()
4344          * that we are in nfs_map(). Now, r_rwlock is acquired in order
4345          * and we can prevent the deadlock that would have occurred
4346          * when nfs_addmap() would have acquired it out of order.
4347          *
4348          * Since we are not protecting r_inmap by any lock, we do not
4349          * hold any lock when we decrement it. We atomically decrement
4350          * r_inmap after we release r_lkserlock.
4351          */
4352 
4353         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4354                 return (EINTR);
4355         atomic_add_int(&rp->r_inmap, 1);
4356         nfs_rw_exit(&rp->r_rwlock);
4357 
4358         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4359                 atomic_add_int(&rp->r_inmap, -1);
4360                 return (EINTR);
4361         }
4362         if (vp->v_flag & VNOCACHE) {
4363                 error = EAGAIN;
4364                 goto done;
4365         }
4366 
4367         /*
4368          * Don't allow concurrent locks and mapping if mandatory locking is
4369          * enabled.
4370          */
4371         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4372             MANDLOCK(vp, va.va_mode)) {
4373                 error = EAGAIN;
4374                 goto done;
4375         }
4376 
4377         as_rangelock(as);
4378         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4379         if (error != 0) {
4380                 as_rangeunlock(as);
4381                 goto done;
4382         }
4383 
4384         vn_a.vp = vp;
4385         vn_a.offset = off;
4386         vn_a.type = (flags & MAP_TYPE);
4387         vn_a.prot = (uchar_t)prot;
4388         vn_a.maxprot = (uchar_t)maxprot;
4389         vn_a.flags = (flags & ~MAP_TYPE);
4390         vn_a.cred = cr;
4391         vn_a.amp = NULL;
4392         vn_a.szc = 0;
4393         vn_a.lgrp_mem_policy_flags = 0;
4394 
4395         error = as_map(as, *addrp, len, segvn_create, &vn_a);
4396         as_rangeunlock(as);
4397 
4398 done:
4399         nfs_rw_exit(&rp->r_lkserlock);
4400         atomic_add_int(&rp->r_inmap, -1);
4401         return (error);
4402 }
4403 
4404 /* ARGSUSED */
4405 static int
4406 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4407         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4408         caller_context_t *ct)
4409 {
4410         rnode_t *rp;
4411 
4412         if (vp->v_flag & VNOMAP)
4413                 return (ENOSYS);
4414         if (nfs_zone() != VTOMI(vp)->mi_zone)
4415                 return (EIO);
4416 
4417         rp = VTOR(vp);
4418         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4419 
4420         return (0);
4421 }
4422 
4423 /* ARGSUSED */
4424 static int
4425 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4426         struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4427 {
4428         netobj lm_fh;
4429         int rc;
4430         u_offset_t start, end;
4431         rnode_t *rp;
4432         int error = 0, intr = INTR(vp);
4433 
4434         /* check for valid cmd parameter */
4435         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4436                 return (EINVAL);
4437         if (nfs_zone() != VTOMI(vp)->mi_zone)
4438                 return (EIO);
4439 
4440         /* Verify l_type. */
4441         switch (bfp->l_type) {
4442         case F_RDLCK:
4443                 if (cmd != F_GETLK && !(flag & FREAD))
4444                         return (EBADF);
4445                 break;
4446         case F_WRLCK:
4447                 if (cmd != F_GETLK && !(flag & FWRITE))
4448                         return (EBADF);
4449                 break;
4450         case F_UNLCK:
4451                 intr = 0;
4452                 break;
4453 
4454         default:
4455                 return (EINVAL);
4456         }
4457 
4458         /* check the validity of the lock range */
4459         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4460                 return (rc);
4461         if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4462                 return (rc);
4463 
4464         /*
4465          * If the filesystem is mounted using local locking, pass the
4466          * request off to the local locking code.
4467          */
4468         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4469                 if (offset > MAXOFF32_T)
4470                         return (EFBIG);
4471                 if (cmd == F_SETLK || cmd == F_SETLKW) {
4472                         /*
4473                          * For complete safety, we should be holding
4474                          * r_lkserlock.  However, we can't call
4475                          * lm_safelock and then fs_frlock while
4476                          * holding r_lkserlock, so just invoke
4477                          * lm_safelock and expect that this will
4478                          * catch enough of the cases.
4479                          */
4480                         if (!lm_safelock(vp, bfp, cr))
4481                                 return (EAGAIN);
4482                 }
4483                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4484         }
4485 
4486         rp = VTOR(vp);
4487 
4488         /*
4489          * Check whether the given lock request can proceed, given the
4490          * current file mappings.
4491          */
4492         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4493                 return (EINTR);
4494         if (cmd == F_SETLK || cmd == F_SETLKW) {
4495                 if (!lm_safelock(vp, bfp, cr)) {
4496                         rc = EAGAIN;
4497                         goto done;
4498                 }
4499         }
4500 
4501         /*
4502          * Flush the cache after waiting for async I/O to finish.  For new
4503          * locks, this is so that the process gets the latest bits from the
4504          * server.  For unlocks, this is so that other clients see the
4505          * latest bits once the file has been unlocked.  If currently dirty
4506          * pages can't be flushed, then don't allow a lock to be set.  But
4507          * allow unlocks to succeed, to avoid having orphan locks on the
4508          * server.
4509          */
4510         if (cmd != F_GETLK) {
4511                 mutex_enter(&rp->r_statelock);
4512                 while (rp->r_count > 0) {
4513                         if (intr) {
4514                                 klwp_t *lwp = ttolwp(curthread);
4515 
4516                                 if (lwp != NULL)
4517                                         lwp->lwp_nostop++;
4518                                 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4519                                     == 0) {
4520                                         if (lwp != NULL)
4521                                                 lwp->lwp_nostop--;
4522                                         rc = EINTR;
4523                                         break;
4524                                 }
4525                                 if (lwp != NULL)
4526                                         lwp->lwp_nostop--;
4527                         } else
4528                         cv_wait(&rp->r_cv, &rp->r_statelock);
4529                 }
4530                 mutex_exit(&rp->r_statelock);
4531                 if (rc != 0)
4532                         goto done;
4533                 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4534                 if (error) {
4535                         if (error == ENOSPC || error == EDQUOT) {
4536                                 mutex_enter(&rp->r_statelock);
4537                                 if (!rp->r_error)
4538                                         rp->r_error = error;
4539                                 mutex_exit(&rp->r_statelock);
4540                         }
4541                         if (bfp->l_type != F_UNLCK) {
4542                                 rc = ENOLCK;
4543                                 goto done;
4544                         }
4545                 }
4546         }
4547 
4548         lm_fh.n_len = sizeof (fhandle_t);
4549         lm_fh.n_bytes = (char *)VTOFH(vp);
4550 
4551         /*
4552          * Call the lock manager to do the real work of contacting
4553          * the server and obtaining the lock.
4554          */
4555         rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4556 
4557         if (rc == 0)
4558                 nfs_lockcompletion(vp, cmd);
4559 
4560 done:
4561         nfs_rw_exit(&rp->r_lkserlock);
4562         return (rc);
4563 }
4564 
4565 /*
4566  * Free storage space associated with the specified vnode.  The portion
4567  * to be freed is specified by bfp->l_start and bfp->l_len (already
4568  * normalized to a "whence" of 0).
4569  *
4570  * This is an experimental facility whose continued existence is not
4571  * guaranteed.  Currently, we only support the special case
4572  * of l_len == 0, meaning free to end of file.
4573  */
4574 /* ARGSUSED */
4575 static int
4576 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4577         offset_t offset, cred_t *cr, caller_context_t *ct)
4578 {
4579         int error;
4580 
4581         ASSERT(vp->v_type == VREG);
4582         if (cmd != F_FREESP)
4583                 return (EINVAL);
4584 
4585         if (offset > MAXOFF32_T)
4586                 return (EFBIG);
4587 
4588         if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4589             (bfp->l_len > MAXOFF32_T))
4590                 return (EFBIG);
4591 
4592         if (nfs_zone() != VTOMI(vp)->mi_zone)
4593                 return (EIO);
4594 
4595         error = convoff(vp, bfp, 0, offset);
4596         if (!error) {
4597                 ASSERT(bfp->l_start >= 0);
4598                 if (bfp->l_len == 0) {
4599                         struct vattr va;
4600 
4601                         /*
4602                          * ftruncate should not change the ctime and
4603                          * mtime if we truncate the file to its
4604                          * previous size.
4605                          */
4606                         va.va_mask = AT_SIZE;
4607                         error = nfsgetattr(vp, &va, cr);
4608                         if (error || va.va_size == bfp->l_start)
4609                                 return (error);
4610                         va.va_mask = AT_SIZE;
4611                         va.va_size = bfp->l_start;
4612                         error = nfssetattr(vp, &va, 0, cr);
4613                 } else
4614                         error = EINVAL;
4615         }
4616 
4617         return (error);
4618 }
4619 
4620 /* ARGSUSED */
4621 static int
4622 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4623 {
4624 
4625         return (EINVAL);
4626 }
4627 
4628 /*
4629  * Setup and add an address space callback to do the work of the delmap call.
4630  * The callback will (and must be) deleted in the actual callback function.
4631  *
4632  * This is done in order to take care of the problem that we have with holding
4633  * the address space's a_lock for a long period of time (e.g. if the NFS server
4634  * is down).  Callbacks will be executed in the address space code while the
4635  * a_lock is not held.  Holding the address space's a_lock causes things such
4636  * as ps and fork to hang because they are trying to acquire this lock as well.
4637  */
4638 /* ARGSUSED */
4639 static int
4640 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4641         size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4642         caller_context_t *ct)
4643 {
4644         int                     caller_found;
4645         int                     error;
4646         rnode_t                 *rp;
4647         nfs_delmap_args_t       *dmapp;
4648         nfs_delmapcall_t        *delmap_call;
4649 
4650         if (vp->v_flag & VNOMAP)
4651                 return (ENOSYS);
4652         /*
4653          * A process may not change zones if it has NFS pages mmap'ed
4654          * in, so we can't legitimately get here from the wrong zone.
4655          */
4656         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4657 
4658         rp = VTOR(vp);
4659 
4660         /*
4661          * The way that the address space of this process deletes its mapping
4662          * of this file is via the following call chains:
4663          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4664          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4665          *
4666          * With the use of address space callbacks we are allowed to drop the
4667          * address space lock, a_lock, while executing the NFS operations that
4668          * need to go over the wire.  Returning EAGAIN to the caller of this
4669          * function is what drives the execution of the callback that we add
4670          * below.  The callback will be executed by the address space code
4671          * after dropping the a_lock.  When the callback is finished, since
4672          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4673          * is called again on the same segment to finish the rest of the work
4674          * that needs to happen during unmapping.
4675          *
4676          * This action of calling back into the segment driver causes
4677          * nfs_delmap() to get called again, but since the callback was
4678          * already executed at this point, it already did the work and there
4679          * is nothing left for us to do.
4680          *
4681          * To Summarize:
4682          * - The first time nfs_delmap is called by the current thread is when
4683          * we add the caller associated with this delmap to the delmap caller
4684          * list, add the callback, and return EAGAIN.
4685          * - The second time in this call chain when nfs_delmap is called we
4686          * will find this caller in the delmap caller list and realize there
4687          * is no more work to do thus removing this caller from the list and
4688          * returning the error that was set in the callback execution.
4689          */
4690         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4691         if (caller_found) {
4692                 /*
4693                  * 'error' is from the actual delmap operations.  To avoid
4694                  * hangs, we need to handle the return of EAGAIN differently
4695                  * since this is what drives the callback execution.
4696                  * In this case, we don't want to return EAGAIN and do the
4697                  * callback execution because there are none to execute.
4698                  */
4699                 if (error == EAGAIN)
4700                         return (0);
4701                 else
4702                         return (error);
4703         }
4704 
4705         /* current caller was not in the list */
4706         delmap_call = nfs_init_delmapcall();
4707 
4708         mutex_enter(&rp->r_statelock);
4709         list_insert_tail(&rp->r_indelmap, delmap_call);
4710         mutex_exit(&rp->r_statelock);
4711 
4712         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4713 
4714         dmapp->vp = vp;
4715         dmapp->off = off;
4716         dmapp->addr = addr;
4717         dmapp->len = len;
4718         dmapp->prot = prot;
4719         dmapp->maxprot = maxprot;
4720         dmapp->flags = flags;
4721         dmapp->cr = cr;
4722         dmapp->caller = delmap_call;
4723 
4724         error = as_add_callback(as, nfs_delmap_callback, dmapp,
4725             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4726 
4727         return (error ? error : EAGAIN);
4728 }
4729 
4730 /*
4731  * Remove some pages from an mmap'd vnode.  Just update the
4732  * count of pages.  If doing close-to-open, then flush all
4733  * of the pages associated with this file.  Otherwise, start
4734  * an asynchronous page flush to write out any dirty pages.
4735  * This will also associate a credential with the rnode which
4736  * can be used to write the pages.
4737  */
4738 /* ARGSUSED */
4739 static void
4740 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4741 {
4742         int                     error;
4743         rnode_t                 *rp;
4744         mntinfo_t               *mi;
4745         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
4746 
4747         rp = VTOR(dmapp->vp);
4748         mi = VTOMI(dmapp->vp);
4749 
4750         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4751         ASSERT(rp->r_mapcnt >= 0);
4752 
4753         /*
4754          * Initiate a page flush if there are pages, the file system
4755          * was not mounted readonly, the segment was mapped shared, and
4756          * the pages themselves were writeable.
4757          */
4758         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4759             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4760                 mutex_enter(&rp->r_statelock);
4761                 rp->r_flags |= RDIRTY;
4762                 mutex_exit(&rp->r_statelock);
4763                 /*
4764                  * If this is a cross-zone access a sync putpage won't work, so
4765                  * the best we can do is try an async putpage.  That seems
4766                  * better than something more draconian such as discarding the
4767                  * dirty pages.
4768                  */
4769                 if ((mi->mi_flags & MI_NOCTO) ||
4770                     nfs_zone() != mi->mi_zone)
4771                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4772                             B_ASYNC, dmapp->cr, NULL);
4773                 else
4774                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4775                             0, dmapp->cr, NULL);
4776                 if (!error) {
4777                         mutex_enter(&rp->r_statelock);
4778                         error = rp->r_error;
4779                         rp->r_error = 0;
4780                         mutex_exit(&rp->r_statelock);
4781                 }
4782         } else
4783                 error = 0;
4784 
4785         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4786                 (void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4787                     B_INVAL, dmapp->cr, NULL);
4788 
4789         dmapp->caller->error = error;
4790         (void) as_delete_callback(as, arg);
4791         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4792 }
4793 
4794 /* ARGSUSED */
4795 static int
4796 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4797         caller_context_t *ct)
4798 {
4799         int error = 0;
4800 
4801         if (nfs_zone() != VTOMI(vp)->mi_zone)
4802                 return (EIO);
4803         /*
4804          * This looks a little weird because it's written in a general
4805          * manner but we make little use of cases.  If cntl() ever gets
4806          * widely used, the outer switch will make more sense.
4807          */
4808 
4809         switch (cmd) {
4810 
4811         /*
4812          * Large file spec - need to base answer new query with
4813          * hardcoded constant based on the protocol.
4814          */
4815         case _PC_FILESIZEBITS:
4816                 *valp = 32;
4817                 return (0);
4818 
4819         case _PC_LINK_MAX:
4820         case _PC_NAME_MAX:
4821         case _PC_PATH_MAX:
4822         case _PC_SYMLINK_MAX:
4823         case _PC_CHOWN_RESTRICTED:
4824         case _PC_NO_TRUNC: {
4825                 mntinfo_t *mi;
4826                 struct pathcnf *pc;
4827 
4828                 if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4829                         return (EINVAL);
4830                 error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4831                 switch (cmd) {
4832                 case _PC_LINK_MAX:
4833                         *valp = pc->pc_link_max;
4834                         break;
4835                 case _PC_NAME_MAX:
4836                         *valp = pc->pc_name_max;
4837                         break;
4838                 case _PC_PATH_MAX:
4839                 case _PC_SYMLINK_MAX:
4840                         *valp = pc->pc_path_max;
4841                         break;
4842                 case _PC_CHOWN_RESTRICTED:
4843                         /*
4844                          * if we got here, error is really a boolean which
4845                          * indicates whether cmd is set or not.
4846                          */
4847                         *valp = error ? 1 : 0;  /* see above */
4848                         error = 0;
4849                         break;
4850                 case _PC_NO_TRUNC:
4851                         /*
4852                          * if we got here, error is really a boolean which
4853                          * indicates whether cmd is set or not.
4854                          */
4855                         *valp = error ? 1 : 0;  /* see above */
4856                         error = 0;
4857                         break;
4858                 }
4859                 return (error ? EINVAL : 0);
4860                 }
4861 
4862         case _PC_XATTR_EXISTS:
4863                 *valp = 0;
4864                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4865                         vnode_t *avp;
4866                         rnode_t *rp;
4867                         mntinfo_t *mi = VTOMI(vp);
4868 
4869                         if (!(mi->mi_flags & MI_EXTATTR))
4870                                 return (0);
4871 
4872                         rp = VTOR(vp);
4873                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4874                             INTR(vp)))
4875                                 return (EINTR);
4876 
4877                         error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4878                         if (error || avp == NULL)
4879                                 error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4880 
4881                         nfs_rw_exit(&rp->r_rwlock);
4882 
4883                         if (error == 0 && avp != NULL) {
4884                                 error = do_xattr_exists_check(avp, valp, cr);
4885                                 VN_RELE(avp);
4886                         }
4887                 }
4888                 return (error ? EINVAL : 0);
4889 
4890         case _PC_ACL_ENABLED:
4891                 *valp = _ACL_ACLENT_ENABLED;
4892                 return (0);
4893 
4894         default:
4895                 return (EINVAL);
4896         }
4897 }
4898 
4899 /*
4900  * Called by async thread to do synchronous pageio. Do the i/o, wait
4901  * for it to complete, and cleanup the page list when done.
4902  */
4903 static int
4904 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4905         int flags, cred_t *cr)
4906 {
4907         int error;
4908 
4909         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4910         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4911         if (flags & B_READ)
4912                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4913         else
4914                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4915         return (error);
4916 }
4917 
4918 /* ARGSUSED */
4919 static int
4920 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4921         int flags, cred_t *cr, caller_context_t *ct)
4922 {
4923         int error;
4924         rnode_t *rp;
4925 
4926         if (pp == NULL)
4927                 return (EINVAL);
4928 
4929         if (io_off > MAXOFF32_T)
4930                 return (EFBIG);
4931         if (nfs_zone() != VTOMI(vp)->mi_zone)
4932                 return (EIO);
4933         rp = VTOR(vp);
4934         mutex_enter(&rp->r_statelock);
4935         rp->r_count++;
4936         mutex_exit(&rp->r_statelock);
4937 
4938         if (flags & B_ASYNC) {
4939                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4940                     nfs_sync_pageio);
4941         } else
4942                 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4943         mutex_enter(&rp->r_statelock);
4944         rp->r_count--;
4945         cv_broadcast(&rp->r_cv);
4946         mutex_exit(&rp->r_statelock);
4947         return (error);
4948 }
4949 
4950 /* ARGSUSED */
4951 static int
4952 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4953         caller_context_t *ct)
4954 {
4955         int error;
4956         mntinfo_t *mi;
4957 
4958         mi = VTOMI(vp);
4959 
4960         if (nfs_zone() != mi->mi_zone)
4961                 return (EIO);
4962         if (mi->mi_flags & MI_ACL) {
4963                 error = acl_setacl2(vp, vsecattr, flag, cr);
4964                 if (mi->mi_flags & MI_ACL)
4965                         return (error);
4966         }
4967 
4968         return (ENOSYS);
4969 }
4970 
4971 /* ARGSUSED */
4972 static int
4973 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4974         caller_context_t *ct)
4975 {
4976         int error;
4977         mntinfo_t *mi;
4978 
4979         mi = VTOMI(vp);
4980 
4981         if (nfs_zone() != mi->mi_zone)
4982                 return (EIO);
4983         if (mi->mi_flags & MI_ACL) {
4984                 error = acl_getacl2(vp, vsecattr, flag, cr);
4985                 if (mi->mi_flags & MI_ACL)
4986                         return (error);
4987         }
4988 
4989         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
4990 }
4991 
4992 /* ARGSUSED */
4993 static int
4994 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
4995         caller_context_t *ct)
4996 {
4997         int error;
4998         struct shrlock nshr;
4999         struct nfs_owner nfs_owner;
5000         netobj lm_fh;
5001 
5002         if (nfs_zone() != VTOMI(vp)->mi_zone)
5003                 return (EIO);
5004 
5005         /*
5006          * check for valid cmd parameter
5007          */
5008         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
5009                 return (EINVAL);
5010 
5011         /*
5012          * Check access permissions
5013          */
5014         if (cmd == F_SHARE &&
5015             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
5016             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
5017                 return (EBADF);
5018 
5019         /*
5020          * If the filesystem is mounted using local locking, pass the
5021          * request off to the local share code.
5022          */
5023         if (VTOMI(vp)->mi_flags & MI_LLOCK)
5024                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
5025 
5026         switch (cmd) {
5027         case F_SHARE:
5028         case F_UNSHARE:
5029                 lm_fh.n_len = sizeof (fhandle_t);
5030                 lm_fh.n_bytes = (char *)VTOFH(vp);
5031 
5032                 /*
5033                  * If passed an owner that is too large to fit in an
5034                  * nfs_owner it is likely a recursive call from the
5035                  * lock manager client and pass it straight through.  If
5036                  * it is not a nfs_owner then simply return an error.
5037                  */
5038                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
5039                         if (((struct nfs_owner *)shr->s_owner)->magic !=
5040                             NFS_OWNER_MAGIC)
5041                                 return (EINVAL);
5042 
5043                         if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
5044                                 error = set_errno(error);
5045                         }
5046                         return (error);
5047                 }
5048                 /*
5049                  * Remote share reservations owner is a combination of
5050                  * a magic number, hostname, and the local owner
5051                  */
5052                 bzero(&nfs_owner, sizeof (nfs_owner));
5053                 nfs_owner.magic = NFS_OWNER_MAGIC;
5054                 (void) strncpy(nfs_owner.hname, uts_nodename(),
5055                     sizeof (nfs_owner.hname));
5056                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5057                 nshr.s_access = shr->s_access;
5058                 nshr.s_deny = shr->s_deny;
5059                 nshr.s_sysid = 0;
5060                 nshr.s_pid = ttoproc(curthread)->p_pid;
5061                 nshr.s_own_len = sizeof (nfs_owner);
5062                 nshr.s_owner = (caddr_t)&nfs_owner;
5063 
5064                 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5065                         error = set_errno(error);
5066                 }
5067 
5068                 break;
5069 
5070         case F_HASREMOTELOCKS:
5071                 /*
5072                  * NFS client can't store remote locks itself
5073                  */
5074                 shr->s_access = 0;
5075                 error = 0;
5076                 break;
5077 
5078         default:
5079                 error = EINVAL;
5080                 break;
5081         }
5082 
5083         return (error);
5084 }