/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 *      All rights reserved.
 */

/*
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int      nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *);
static int      nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int      nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int      nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int      nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int      nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int      nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *);
static int      nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int      nfs_bio(struct buf *, cred_t *);
static int      nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *);
static void     nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
                        cred_t *);
static int      nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static int      nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static void     nfs_delmap_callback(struct as *, void *, uint_t);

/*
 * Error values used to pass information about certain special errors
 * which require special handling.
 */
#define NFS_EOF                 -98

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int      nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
static int      nfs_close(vnode_t *, int, int, offset_t, cred_t *,
                        caller_context_t *);
static int      nfs_read(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs_write(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
                        caller_context_t *);
static int      nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int      nfs_accessx(void *, int, cred_t *);
static int      nfs_readlink(vnode_t *, struct uio *, cred_t *,
                        caller_context_t *);
static int      nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void     nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
static int      nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
                        int, vnode_t *, cred_t *, caller_context_t *,
                        int *, pathname_t *);
static int      nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *, int, caller_context_t *,
                        vsecattr_t *);
static int      nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
                        int);
static int      nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
                        cred_t *, caller_context_t *, int, vsecattr_t *);
static int      nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
                        caller_context_t *, int);
static int      nfs_symlink(vnode_t *, char *, struct vattr *, char *,
                        cred_t *, caller_context_t *, int);
static int      nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
                        caller_context_t *, int);
static int      nfs_fid(vnode_t *, fid_t *, caller_context_t *);
static int      nfs_rwlock(vnode_t *, int, caller_context_t *);
static void     nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int      nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int      nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *, caller_context_t *);
static int      nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
                        caller_context_t *);
static int      nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
                        struct flk_callback *, cred_t *, caller_context_t *);
static int      nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
                        cred_t *, caller_context_t *);
static int      nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int      nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int      nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
                        caller_context_t *);
static int      nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *, caller_context_t *);
static int      nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
                        caller_context_t *);

struct vnodeops *nfs_vnodeops;

const fs_operation_def_t nfs_vnodeops_template[] = {
        VOPNAME_OPEN,           { .vop_open = nfs_open },
        VOPNAME_CLOSE,          { .vop_close = nfs_close },
        VOPNAME_READ,           { .vop_read = nfs_read },
        VOPNAME_WRITE,          { .vop_write = nfs_write },
        VOPNAME_IOCTL,          { .vop_ioctl = nfs_ioctl },
        VOPNAME_GETATTR,        { .vop_getattr = nfs_getattr },
        VOPNAME_SETATTR,        { .vop_setattr = nfs_setattr },
        VOPNAME_ACCESS,         { .vop_access = nfs_access },
        VOPNAME_LOOKUP,         { .vop_lookup = nfs_lookup },
        VOPNAME_CREATE,         { .vop_create = nfs_create },
        VOPNAME_REMOVE,         { .vop_remove = nfs_remove },
        VOPNAME_LINK,           { .vop_link = nfs_link },
        VOPNAME_RENAME,         { .vop_rename = nfs_rename },
        VOPNAME_MKDIR,          { .vop_mkdir = nfs_mkdir },
        VOPNAME_RMDIR,          { .vop_rmdir = nfs_rmdir },
        VOPNAME_READDIR,        { .vop_readdir = nfs_readdir },
        VOPNAME_SYMLINK,        { .vop_symlink = nfs_symlink },
        VOPNAME_READLINK,       { .vop_readlink = nfs_readlink },
        VOPNAME_FSYNC,          { .vop_fsync = nfs_fsync },
        VOPNAME_INACTIVE,       { .vop_inactive = nfs_inactive },
        VOPNAME_FID,            { .vop_fid = nfs_fid },
        VOPNAME_RWLOCK,         { .vop_rwlock = nfs_rwlock },
        VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs_rwunlock },
        VOPNAME_SEEK,           { .vop_seek = nfs_seek },
        VOPNAME_FRLOCK,         { .vop_frlock = nfs_frlock },
        VOPNAME_SPACE,          { .vop_space = nfs_space },
        VOPNAME_REALVP,         { .vop_realvp = nfs_realvp },
        VOPNAME_GETPAGE,        { .vop_getpage = nfs_getpage },
        VOPNAME_PUTPAGE,        { .vop_putpage = nfs_putpage },
        VOPNAME_MAP,            { .vop_map = nfs_map },
        VOPNAME_ADDMAP,         { .vop_addmap = nfs_addmap },
        VOPNAME_DELMAP,         { .vop_delmap = nfs_delmap },
        VOPNAME_DUMP,           { .vop_dump = nfs_dump },
        VOPNAME_PATHCONF,       { .vop_pathconf = nfs_pathconf },
        VOPNAME_PAGEIO,         { .vop_pageio = nfs_pageio },
        VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs_setsecattr },
        VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs_getsecattr },
        VOPNAME_SHRLOCK,        { .vop_shrlock = nfs_shrlock },
        VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
        NULL,                   NULL
};

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs_getvnodeops(void)
{
        return (nfs_vnodeops);
}

/* ARGSUSED */
static int
nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
        int error;
        struct vattr va;
        rnode_t *rp;
        vnode_t *vp;

        vp = *vpp;
        rp = VTOR(vp);
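        /*
         * Over the wire operations are not possible on behalf of a
         * process in a foreign zone; see the longer discussion in
         * nfs_close() below.
         */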
        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);
        mutex_enter(&rp->r_statelock);
        if (rp->r_cred == NULL) {
                crhold(cr);
                rp->r_cred = cr;
        }
        mutex_exit(&rp->r_statelock);

        /*
         * If there is no cached data or if close-to-open
         * consistency checking is turned off, we can avoid
         * the over the wire getattr.  Otherwise, if the
         * file system is mounted readonly, then just verify
         * the caches are up to date using the normal mechanism.
         * Else, if the file is not mmap'd, then just mark
         * the attributes as timed out.  They will be refreshed
         * and the caches validated prior to being used.
         * Else, the file system is mounted writeable so
         * force an over the wire GETATTR in order to ensure
         * that all cached data is valid.
         */
        if (vp->v_count > 1 ||
            ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
            !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
                if (vn_is_readonly(vp))
                        error = nfs_validate_caches(vp, cr);
                else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
                        PURGE_ATTRCACHE(vp);
                        error = 0;
                } else {
                        va.va_mask = AT_ALL;
                        error = nfs_getattr_otw(vp, &va, cr);
                }
        } else
                error = 0;

        return (error);
}

/* ARGSUSED */
static int
nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
        caller_context_t *ct)
{
        rnode_t *rp;
        int error;
        struct vattr va;

        /*
         * zone_enter(2) prevents processes from changing zones with NFS files
         * open; if we happen to get here from the wrong zone we can't do
         * anything over the wire.
         */
        if (VTOMI(vp)->mi_zone != nfs_zone()) {
                /*
                 * We could attempt to clean up locks, except we're sure
                 * that the current process didn't acquire any locks on
                 * the file: any attempt to lock a file belonging to another
                 * zone will fail, and one can't lock an NFS file and then
                 * change zones, as that fails too.
                 *
                 * Returning an error here is the sane thing to do.  A
                 * subsequent call to VN_RELE() which translates to a
                 * nfs_inactive() will clean up state: if the zone of the
                 * vnode's origin is still alive and kicking, an async worker
                 * thread will handle the request (from the correct zone), and
                 * everything (minus the final nfs_getattr_otw() call) should
                 * be OK. If the zone is going away nfs_async_inactive() will
                 * throw away cached pages inline.
                 */
                return (EIO);
        }

        /*
         * If we are using local locking for this filesystem, then
         * release all of the SYSV style record locks.  Otherwise,
         * we are doing network locking and we need to release all
         * of the network locks.  All of the locks held by this
         * process on this file are released no matter what the
         * incoming reference count is.
         */
        if (VTOMI(vp)->mi_flags & MI_LLOCK) {
                cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
                cleanshares(vp, ttoproc(curthread)->p_pid);
        } else
                nfs_lockrelease(vp, flag, offset, cr);

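        /*
         * The remaining work is done only on the last close; the
         * lock and share cleanup above runs on every close.
         */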
        if (count > 1)
                return (0);

        /*
         * If the file has been `unlinked', then purge the
         * DNLC so that this vnode will get recycled more quickly
         * and the .nfs* file on the server will get removed.
         */
        rp = VTOR(vp);
        if (rp->r_unldvp != NULL)
                dnlc_purge_vp(vp);

        /*
         * If the file was open for write and there are pages:
         * if the file system was mounted using the "no-close-
         *      to-open" semantics, then start an asynchronous flush
         *      of all of the pages in the file;
         * else the file system was not mounted using the "no-close-
         *      to-open" semantics, so do a synchronous flush and
         *      commit of all of the dirty and uncommitted pages.
         *
         * The asynchronous flush of the pages in the "nocto" path
         * mostly just associates a cred pointer with the rnode so
         * writes which happen later will have a better chance of
         * working.  It also starts the data being written to the
         * server, but without unnecessarily delaying the application.
         */
        if ((flag & FWRITE) && vn_has_cached_data(vp)) {
                if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
                        error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
                            cr, ct);
                        if (error == EAGAIN)
                                error = 0;
                } else
                        error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
                if (!error) {
                        mutex_enter(&rp->r_statelock);
                        error = rp->r_error;
                        rp->r_error = 0;
                        mutex_exit(&rp->r_statelock);
                }
        } else {
                mutex_enter(&rp->r_statelock);
                error = rp->r_error;
                rp->r_error = 0;
                mutex_exit(&rp->r_statelock);
        }

        /*
         * If RWRITEATTR is set, then issue an over the wire GETATTR to
         * refresh the attribute cache with a set of attributes which
         * weren't returned from a WRITE.  This will enable the close-
         * to-open processing to work.
         */
        if (rp->r_flags & RWRITEATTR)
                (void) nfs_getattr_otw(vp, &va, cr);

        return (error);
}

/* ARGSUSED */
static int
nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
        caller_context_t *ct)
{
        rnode_t *rp;
        u_offset_t off;
        offset_t diff;
        int on;
        size_t n;
        caddr_t base;
        uint_t flags;
        int error;
        mntinfo_t *mi;

        rp = VTOR(vp);
        mi = VTOMI(vp);

        if (nfs_zone() != mi->mi_zone)
                return (EIO);

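        /*
         * The VOP_RWLOCK entry point has already acquired r_rwlock
         * as a reader on our behalf; the ASSERT just verifies this.
         */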
        ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

        if (vp->v_type != VREG)
                return (EISDIR);

        if (uiop->uio_resid == 0)
                return (0);

        if (uiop->uio_loffset > MAXOFF32_T)
                return (EFBIG);

        if (uiop->uio_loffset < 0 ||
            uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
                return (EINVAL);

        /*
         * Bypass VM if caching has been disabled (e.g., locking) or if
         * using client-side direct I/O and the file is not mmap'd and
         * there are no cached pages.
         */
        if ((vp->v_flag & VNOCACHE) ||
            (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
            rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
            !vn_has_cached_data(vp))) {
                size_t bufsize;
                size_t resid = 0;

                /*
                 * Let's try to do the read in as large a chunk as we
                 * can (filesystem (NFS client) bsize if possible/needed).
                 * For V3 this is 32K; for V2 it is 8K.
                 */
                bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
                base = kmem_alloc(bufsize, KM_SLEEP);
                do {
                        n = MIN(uiop->uio_resid, bufsize);
                        error = nfsread(vp, base, uiop->uio_offset, n,
                            &resid, cr);
                        if (!error) {
                                n -= resid;
                                error = uiomove(base, n, UIO_READ, uiop);
                        }
                } while (!error && uiop->uio_resid > 0 && n > 0);
                kmem_free(base, bufsize);
                return (error);
        }

        error = 0;

        do {
                off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
                on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
                n = MIN(MAXBSIZE - on, uiop->uio_resid);

                error = nfs_validate_caches(vp, cr);
                if (error)
                        break;

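                /*
                 * If a cache purge is in progress, wait for it to
                 * finish before trusting r_size or the cached pages.
                 */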
                mutex_enter(&rp->r_statelock);
                while (rp->r_flags & RINCACHEPURGE) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                mutex_exit(&rp->r_statelock);
                                return (EINTR);
                        }
                }
                diff = rp->r_size - uiop->uio_loffset;
                mutex_exit(&rp->r_statelock);
                if (diff <= 0)
                        break;
                if (diff < n)
                        n = (size_t)diff;

                if (vpm_enable) {
                        /*
                         * Copy data.
                         */
                        error = vpm_data_copy(vp, off + on, n, uiop,
                            1, NULL, 0, S_READ);
                } else {
                        base = segmap_getmapflt(segkmap, vp, off + on, n,
                            1, S_READ);
                        error = uiomove(base + on, n, UIO_READ, uiop);
                }

                if (!error) {
                        /*
                         * If we read a whole block or read to EOF, we
                         * won't need this buffer again soon.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (n + on == MAXBSIZE ||
                            uiop->uio_loffset == rp->r_size)
                                flags = SM_DONTNEED;
                        else
                                flags = 0;
                        mutex_exit(&rp->r_statelock);
                        if (vpm_enable) {
                                error = vpm_sync_pages(vp, off, n, flags);
                        } else {
                                error = segmap_release(segkmap, base, flags);
                        }
                } else {
                        if (vpm_enable) {
                                (void) vpm_sync_pages(vp, off, n, 0);
                        } else {
                                (void) segmap_release(segkmap, base, 0);
                        }
                }
        } while (!error && uiop->uio_resid > 0);

        return (error);
}

/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
        caller_context_t *ct)
{
        rnode_t *rp;
        u_offset_t off;
        caddr_t base;
        uint_t flags;
        int remainder;
        size_t n;
        int on;
        int error;
        int resid;
        offset_t offset;
        rlim_t limit;
        mntinfo_t *mi;

        rp = VTOR(vp);

        mi = VTOMI(vp);
        if (nfs_zone() != mi->mi_zone)
                return (EIO);
        if (vp->v_type != VREG)
                return (EISDIR);

        if (uiop->uio_resid == 0)
                return (0);

        if (ioflag & FAPPEND) {
                struct vattr va;

                /*
                 * Must serialize if appending.
                 */
                if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
                        nfs_rw_exit(&rp->r_rwlock);
                        if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
                            INTR(vp)))
                                return (EINTR);
                }

                va.va_mask = AT_SIZE;
                error = nfsgetattr(vp, &va, cr);
                if (error)
                        return (error);
                uiop->uio_loffset = va.va_size;
        }

        if (uiop->uio_loffset > MAXOFF32_T)
                return (EFBIG);

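        /* Compute the end of the transfer for the range and limit checks. */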
        offset = uiop->uio_loffset + uiop->uio_resid;

        if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
                return (EINVAL);

        if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
                limit = MAXOFF32_T;
        } else {
                limit = (rlim_t)uiop->uio_llimit;
        }

        /*
         * Check to make sure that the process will not exceed
         * its limit on file size.  It is okay to write up to
         * the limit, but not beyond.  Thus, the write which
         * reaches the limit will be short and the next write
         * will return an error.
         */
        remainder = 0;
        if (offset > limit) {
                remainder = offset - limit;
                uiop->uio_resid = limit - uiop->uio_offset;
                if (uiop->uio_resid <= 0) {
                        proc_t *p = ttoproc(curthread);

                        uiop->uio_resid += remainder;
                        mutex_enter(&p->p_lock);
                        (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
                            p->p_rctls, p, RCA_UNSAFE_SIGINFO);
                        mutex_exit(&p->p_lock);
                        return (EFBIG);
                }
        }

        if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
                return (EINTR);

        /*
         * Bypass VM if caching has been disabled (e.g., locking) or if
         * using client-side direct I/O and the file is not mmap'd and
         * there are no cached pages.
         */
        if ((vp->v_flag & VNOCACHE) ||
            (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
            rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
            !vn_has_cached_data(vp))) {
                size_t bufsize;
                int count;
                uint_t org_offset;

nfs_fwrite:
                if (rp->r_flags & RSTALE) {
                        resid = uiop->uio_resid;
                        offset = uiop->uio_loffset;
                        error = rp->r_error;
                        /*
                         * A close may have cleared r_error; if so,
                         * propagate the ESTALE error return properly.
                         */
                        if (error == 0)
                                error = ESTALE;
                        goto bottom;
                }
                bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
                base = kmem_alloc(bufsize, KM_SLEEP);
                do {
                        resid = uiop->uio_resid;
                        offset = uiop->uio_loffset;
                        count = MIN(uiop->uio_resid, bufsize);
                        org_offset = uiop->uio_offset;
                        error = uiomove(base, count, UIO_WRITE, uiop);
                        if (!error) {
                                error = nfswrite(vp, base, org_offset,
                                    count, cr);
                        }
                } while (!error && uiop->uio_resid > 0);
                kmem_free(base, bufsize);
                goto bottom;
        }

        do {
                off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
                on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
                n = MIN(MAXBSIZE - on, uiop->uio_resid);

                resid = uiop->uio_resid;
                offset = uiop->uio_loffset;

                if (rp->r_flags & RSTALE) {
                        error = rp->r_error;
                        /*
                         * A close may have cleared r_error; if so,
                         * propagate the ESTALE error return properly.
                         */
                        if (error == 0)
                                error = ESTALE;
                        break;
                }

                /*
                 * Don't create dirty pages faster than they
                 * can be cleaned so that the system doesn't
                 * get imbalanced.  If the async queue is
                 * maxed out, then wait for it to drain before
                 * creating more dirty pages.  Also, wait for
                 * any threads doing pagewalks in the vop_getattr
                 * entry points so that they don't block for
                 * long periods.
                 */
                mutex_enter(&rp->r_statelock);
                while ((mi->mi_max_threads != 0 &&
                    rp->r_awcount > 2 * mi->mi_max_threads) ||
                    rp->r_gcount > 0) {
                        if (INTR(vp)) {
                                klwp_t *lwp = ttolwp(curthread);

                                if (lwp != NULL)
                                        lwp->lwp_nostop++;
                                if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                        mutex_exit(&rp->r_statelock);
                                        if (lwp != NULL)
                                                lwp->lwp_nostop--;
                                        error = EINTR;
                                        goto bottom;
                                }
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                        } else
                                cv_wait(&rp->r_cv, &rp->r_statelock);
                }
                mutex_exit(&rp->r_statelock);

                /*
                 * Touch the page and fault it in if it is not in core
                 * before segmap_getmapflt or vpm_data_copy can lock it.
                 * This avoids a deadlock when the buffer is mmap'd
                 * to the same file that we want to write.
                 */
                uio_prefaultpages((long)n, uiop);

                if (vpm_enable) {
                        /*
                         * It will use kpm mappings, so no need to
                         * pass an address.
                         */
                        error = writerp(rp, NULL, n, uiop, 0);
                } else {
                        if (segmap_kpm) {
                                int pon = uiop->uio_loffset & PAGEOFFSET;
                                size_t pn = MIN(PAGESIZE - pon,
                                    uiop->uio_resid);
                                int pagecreate;

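                                /*
                                 * The page's old contents need not be
                                 * faulted in if the write begins on a
                                 * page boundary and either fills the
                                 * page or extends the file.
                                 */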
                                mutex_enter(&rp->r_statelock);
                                pagecreate = (pon == 0) && (pn == PAGESIZE ||
                                    uiop->uio_loffset + pn >= rp->r_size);
                                mutex_exit(&rp->r_statelock);

                                base = segmap_getmapflt(segkmap, vp, off + on,
                                    pn, !pagecreate, S_WRITE);

                                error = writerp(rp, base + pon, n, uiop,
                                    pagecreate);

                        } else {
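                                /*
                                 * Without kpm, take the mapping without
                                 * forcing a fault; the copy done by
                                 * writerp() faults the pages in as
                                 * needed.
                                 */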
                                base = segmap_getmapflt(segkmap, vp, off + on,
                                    n, 0, S_READ);
                                error = writerp(rp, base + on, n, uiop, 0);
                        }
                }

                if (!error) {
                        if (mi->mi_flags & MI_NOAC)
                                flags = SM_WRITE;
                        else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
                                /*
                                 * Have written a whole block.
                                 * Start an asynchronous write
                                 * and mark the buffer to
                                 * indicate that it won't be
                                 * needed again soon.
                                 */
                                flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
                        } else
                                flags = 0;
                        if ((ioflag & (FSYNC|FDSYNC)) ||
                            (rp->r_flags & ROUTOFSPACE)) {
                                flags &= ~SM_ASYNC;
                                flags |= SM_WRITE;
                        }
                        if (vpm_enable) {
                                error = vpm_sync_pages(vp, off, n, flags);
                        } else {
                                error = segmap_release(segkmap, base, flags);
                        }
                } else {
                        if (vpm_enable) {
                                (void) vpm_sync_pages(vp, off, n, 0);
                        } else {
                                (void) segmap_release(segkmap, base, 0);
                        }
                        /*
                         * In the event that we got an access error while
                         * faulting in a page for a write-only file, just
                         * force a write.
                         */
                        if (error == EACCES)
                                goto nfs_fwrite;
                }
        } while (!error && uiop->uio_resid > 0);

bottom:
        if (error) {
                uiop->uio_resid = resid + remainder;
                uiop->uio_loffset = offset;
        } else
                uiop->uio_resid += remainder;

        nfs_rw_exit(&rp->r_lkserlock);

        return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
        int flags, cred_t *cr)
{
        struct buf *bp;
        int error;

        ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
        bp = pageio_setup(pp, len, vp, flags);
        ASSERT(bp != NULL);

        /*
         * pageio_setup should have set b_addr to 0.  This
         * is correct since we want to do I/O on a page
         * boundary.  bp_mapin will use this addr to calculate
         * an offset, and then set b_addr to the kernel virtual
         * address it allocated for us.
         */
        ASSERT(bp->b_un.b_addr == 0);

        bp->b_edev = 0;
        bp->b_dev = 0;
        bp->b_lblkno = lbtodb(off);
        bp->b_file = vp;
        bp->b_offset = (offset_t)off;
        bp_mapin(bp);

        error = nfs_bio(bp, cr);

        bp_mapout(bp);
        pageio_done(bp);

        return (error);
}

/*
 * Write to a file.  Writes to the remote server in the largest size
 * chunks that the server can handle.  Write is synchronous.
 */
static int
nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfswriteargs wa;
        struct nfsattrstat ns;
        int error;
        int tsize;
        int douprintf;

        douprintf = 1;

        rp = VTOR(vp);
        mi = VTOMI(vp);

        ASSERT(nfs_zone() == mi->mi_zone);

        wa.wa_args = &wa.wa_args_buf;
        wa.wa_fhandle = *VTOFH(vp);

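        /*
         * Each pass sends at most mi_curwrite bytes; for NFS
         * Version 2 both wa_totcount and wa_count carry the
         * size of this transfer.
         */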
        do {
                tsize = MIN(mi->mi_curwrite, count);
                wa.wa_data = base;
                wa.wa_begoff = offset;
                wa.wa_totcount = tsize;
                wa.wa_count = tsize;
                wa.wa_offset = offset;

                if (mi->mi_io_kstats) {
                        mutex_enter(&mi->mi_lock);
                        kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                        mutex_exit(&mi->mi_lock);
                }
                wa.wa_mblk = NULL;
                do {
                        error = rfs2call(mi, RFS_WRITE,
                            xdr_writeargs, (caddr_t)&wa,
                            xdr_attrstat, (caddr_t)&ns, cr,
                            &douprintf, &ns.ns_status, 0, NULL);
                } while (error == ENFS_TRYAGAIN);
                if (mi->mi_io_kstats) {
                        mutex_enter(&mi->mi_lock);
                        kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
                        mutex_exit(&mi->mi_lock);
                }

                if (!error) {
                        error = geterrno(ns.ns_status);
                        /*
                         * Can't check for stale fhandle and purge caches
                         * here because pages are held by nfs_getpage.
                         * Just mark the attribute cache as timed out
                         * and set RWRITEATTR to indicate that the file
                         * was modified with a WRITE operation.
                         */
                        if (!error) {
                                count -= tsize;
                                base += tsize;
                                offset += tsize;
                                if (mi->mi_io_kstats) {
                                        mutex_enter(&mi->mi_lock);
                                        KSTAT_IO_PTR(mi->mi_io_kstats)->
                                            writes++;
                                        KSTAT_IO_PTR(mi->mi_io_kstats)->
                                            nwritten += tsize;
                                        mutex_exit(&mi->mi_lock);
                                }
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                                mutex_enter(&rp->r_statelock);
                                PURGE_ATTRCACHE_LOCKED(rp);
                                rp->r_flags |= RWRITEATTR;
                                mutex_exit(&rp->r_statelock);
                        }
                }
        } while (!error && count);

        return (error);
}

/*
 * Read from a file.  Reads data in the largest chunks our interface
 * can handle.
 */
static int
nfsread(vnode_t *vp, caddr_t base, uint_t offset,
    int count, size_t *residp, cred_t *cr)
{
        mntinfo_t *mi;
        struct nfsreadargs ra;
        struct nfsrdresult rr;
        int tsize;
        int error;
        int douprintf;
        failinfo_t fi;
        rnode_t *rp;
        struct vattr va;
        hrtime_t t;

        rp = VTOR(vp);
        mi = VTOMI(vp);

        ASSERT(nfs_zone() == mi->mi_zone);

        douprintf = 1;

        ra.ra_fhandle = *VTOFH(vp);

        fi.vp = vp;
        fi.fhp = (caddr_t)&ra.ra_fhandle;
        fi.copyproc = nfscopyfh;
        fi.lookupproc = nfslookup;
        fi.xattrdirproc = acl_getxattrdir2;

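        /*
         * The failinfo gives the RPC layer what it needs to re-obtain
         * the file handle (via lookup) if the request must be retried
         * against another server, e.g. on failover in a replicated,
         * read-only mount.
         */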
        do {
                if (mi->mi_io_kstats) {
                        mutex_enter(&mi->mi_lock);
                        kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                        mutex_exit(&mi->mi_lock);
                }

                do {
                        tsize = MIN(mi->mi_curread, count);
                        rr.rr_data = base;
                        ra.ra_offset = offset;
                        ra.ra_totcount = tsize;
                        ra.ra_count = tsize;
                        ra.ra_data = base;
                        t = gethrtime();
                        error = rfs2call(mi, RFS_READ,
                            xdr_readargs, (caddr_t)&ra,
                            xdr_rdresult, (caddr_t)&rr, cr,
                            &douprintf, &rr.rr_status, 0, &fi);
                } while (error == ENFS_TRYAGAIN);

                if (mi->mi_io_kstats) {
                        mutex_enter(&mi->mi_lock);
                        kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
                        mutex_exit(&mi->mi_lock);
                }

                if (!error) {
                        error = geterrno(rr.rr_status);
                        if (!error) {
                                count -= rr.rr_count;
                                base += rr.rr_count;
                                offset += rr.rr_count;
                                if (mi->mi_io_kstats) {
                                        mutex_enter(&mi->mi_lock);
                                        KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
                                        KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
                                            rr.rr_count;
                                        mutex_exit(&mi->mi_lock);
                                }
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        }
                }
        } while (!error && count && rr.rr_count == tsize);

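        /* Report back the number of bytes which were not transferred. */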
        *residp = count;

        if (!error) {
                /*
                 * Since no error occurred, we have the current
                 * attributes and we need to do a cache check and then
                 * potentially update the cached attributes.  We can't
                 * use the normal attribute check and cache mechanisms
                 * because they might cause a cache flush which would
                 * deadlock.  Instead, we just check the cache to see
                 * if the attributes have changed.  If they have, then we
                 * just mark the attributes as out of date.  The next
                 * time that the attributes are checked, they will be
                 * out of date, new attributes will be fetched, and
                 * the page cache will be flushed.  If the attributes
                 * weren't changed, then we just update the cached
                 * attributes with these attributes.
                 */
                /*
                 * If NFS_ACL is supported on the server, then the
                 * attributes returned by server may have minimal
                 * permissions sometimes denying access to users having
                 * proper access.  To get the proper attributes, mark
                 * the attributes as expired so that they will be
                 * refetched via the NFS_ACL GETATTR2 procedure.
                 */
                error = nattr_to_vattr(vp, &rr.rr_attr, &va);
                mutex_enter(&rp->r_statelock);
                if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
                    (mi->mi_flags & MI_ACL)) {
                        mutex_exit(&rp->r_statelock);
                        PURGE_ATTRCACHE(vp);
                } else {
                        if (rp->r_mtime <= t) {
                                nfs_attrcache_va(vp, &va);
                        }
                        mutex_exit(&rp->r_statelock);
                }
        }

        return (error);
}

/* ARGSUSED */
static int
nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
        caller_context_t *ct)
{

        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);
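        /* The only ioctl supported over NFS is the direct I/O control. */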
        switch (cmd) {
                case _FIODIRECTIO:
                        return (nfs_directio(vp, (int)arg, cr));
                default:
                        return (ENOTTY);
        }
}

/* ARGSUSED */
static int
nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
        caller_context_t *ct)
{
        int error;
        rnode_t *rp;

        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);
        /*
         * If it has been specified that the return value will
         * just be used as a hint, and we are only being asked
         * for size, fsid or rdevid, then return the client's
         * notion of these values without checking to make sure
         * that the attribute cache is up to date.
         * The whole point is to avoid an over the wire GETATTR
         * call.
         */
        rp = VTOR(vp);
        if (flags & ATTR_HINT) {
                if (vap->va_mask ==
                    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
                        mutex_enter(&rp->r_statelock);
                        if (vap->va_mask & AT_SIZE)
                                vap->va_size = rp->r_size;
                        if (vap->va_mask & AT_FSID)
                                vap->va_fsid = rp->r_attr.va_fsid;
                        if (vap->va_mask & AT_RDEV)
                                vap->va_rdev = rp->r_attr.va_rdev;
                        mutex_exit(&rp->r_statelock);
                        return (0);
                }
        }

        /*
         * Only need to flush pages if asking for the mtime
         * and if there are any dirty pages or any outstanding
         * asynchronous (write) requests for this file.
         */
        if (vap->va_mask & AT_MTIME) {
                if (vn_has_cached_data(vp) &&
                    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
                        mutex_enter(&rp->r_statelock);
                        rp->r_gcount++;
                        mutex_exit(&rp->r_statelock);
                        error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
                        mutex_enter(&rp->r_statelock);
                        if (error == ENOSPC || error == EDQUOT) {
                                if (!rp->r_error)
                                        rp->r_error = error;
                        }
                        if (--rp->r_gcount == 0)
                                cv_broadcast(&rp->r_cv);
                        mutex_exit(&rp->r_statelock);
                }
        }

        return (nfsgetattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
                caller_context_t *ct)
{
        int error;
        uint_t mask;
        struct vattr va;

        mask = vap->va_mask;

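        /* Attributes covered by AT_NOSET can never be set directly. */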
        if (mask & AT_NOSET)
                return (EINVAL);

        if ((mask & AT_SIZE) &&
            vap->va_type == VREG &&
            vap->va_size > MAXOFF32_T)
                return (EFBIG);

        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);

        va.va_mask = AT_UID | AT_MODE;

        error = nfsgetattr(vp, &va, cr);
        if (error)
                return (error);

        error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
            vp);

        if (error)
                return (error);

        error = nfssetattr(vp, vap, flags, cr);

        if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0)
                vnevent_truncate(vp, ct);

        return (error);
}

static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
        int error;
        uint_t mask;
        struct nfssaargs args;
        struct nfsattrstat ns;
        int douprintf;
        rnode_t *rp;
        struct vattr va;
        mode_t omode;
        mntinfo_t *mi;
        vsecattr_t *vsp;
        hrtime_t t;

        mask = vap->va_mask;

        ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

        rp = VTOR(vp);

        /*
         * Only need to flush pages if there are any pages and
         * if the file is marked as dirty in some fashion.  The
         * file must be flushed so that we can accurately
         * determine the size of the file and the cached data
         * after the SETATTR returns.  A file is considered to
         * be dirty if it is either marked with RDIRTY, has
         * outstanding i/o's active, or is mmap'd.  In this
         * last case, we can't tell whether there are dirty
         * pages, so we flush just to be sure.
         */
        if (vn_has_cached_data(vp) &&
            ((rp->r_flags & RDIRTY) ||
            rp->r_count > 0 ||
            rp->r_mapcnt > 0)) {
                ASSERT(vp->v_type != VCHR);
                error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
                if (error == ENOSPC || error == EDQUOT) {
1222                         mutex_enter(&rp->r_statelock);
1223                         if (!rp->r_error)
1224                                 rp->r_error = error;
1225                         mutex_exit(&rp->r_statelock);
1226                 }
1227         }
1228 
1229         /*
1230          * If the system call was utime(2) or utimes(2) and the
1231          * application did not specify the times, then set the
1232          * mtime nanosecond field to 1 billion.  This will get
1233          * translated from 1 billion nanoseconds to 1 million
1234          * microseconds in the over the wire request.  The
1235          * server will use 1 million in the microsecond field
1236          * to tell whether both the mtime and atime should be
1237          * set to the server's current time.
1238          *
1239          * This is an overload of the protocol and should be
1240          * documented in the NFS Version 2 protocol specification.
1241          */
1242         if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1243                 vap->va_mtime.tv_nsec = 1000000000;
1244                 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1245                     NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1246                         error = vattr_to_sattr(vap, &args.saa_sa);
1247                 } else {
1248                         /*
1249                          * Use server times. vap time values will not be used.
1250                          * To ensure no time overflow, make sure vap has
1251                          * valid values, but retain the original values.
1252                          */
1253                         timestruc_t     mtime = vap->va_mtime;
1254                         timestruc_t     atime = vap->va_atime;
1255                         time_t          now;
1256 
1257                         now = gethrestime_sec();
1258                         if (NFS_TIME_T_OK(now)) {
1259                                 /* Just in case server does not know of this */
1260                                 vap->va_mtime.tv_sec = now;
1261                                 vap->va_atime.tv_sec = now;
1262                         } else {
1263                                 vap->va_mtime.tv_sec = 0;
1264                                 vap->va_atime.tv_sec = 0;
1265                         }
1266                         error = vattr_to_sattr(vap, &args.saa_sa);
1267                         /* set vap times back on */
1268                         vap->va_mtime = mtime;
1269                         vap->va_atime = atime;
1270                 }
1271         } else {
1272                 /* Either do not set times or use the client specified times */
1273                 error = vattr_to_sattr(vap, &args.saa_sa);
1274         }
1275         if (error) {
1276                 /* req time field(s) overflow - return immediately */
1277                 return (error);
1278         }
1279         args.saa_fh = *VTOFH(vp);
1280 
1281         va.va_mask = AT_MODE;
1282         error = nfsgetattr(vp, &va, cr);
1283         if (error)
1284                 return (error);
1285         omode = va.va_mode;
1286 
1287         mi = VTOMI(vp);
1288 
1289         douprintf = 1;
1290 
1291         t = gethrtime();
1292 
1293         error = rfs2call(mi, RFS_SETATTR,
1294             xdr_saargs, (caddr_t)&args,
1295             xdr_attrstat, (caddr_t)&ns, cr,
1296             &douprintf, &ns.ns_status, 0, NULL);
1297 
1298         /*
1299          * Purge the access cache and ACL cache if changing either the
1300          * owner of the file, the group owner, or the mode.  These may
1301          * change the access permissions of the file, so purge old
1302          * information and start over again.
1303          */
1304         if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1305                 (void) nfs_access_purge_rp(rp);
1306                 if (rp->r_secattr != NULL) {
1307                         mutex_enter(&rp->r_statelock);
1308                         vsp = rp->r_secattr;
1309                         rp->r_secattr = NULL;
1310                         mutex_exit(&rp->r_statelock);
1311                         if (vsp != NULL)
1312                                 nfs_acl_free(vsp);
1313                 }
1314         }
1315 
1316         if (!error) {
1317                 error = geterrno(ns.ns_status);
1318                 if (!error) {
1319                         /*
1320                          * If changing the size of the file, invalidate
1321                          * any local cached data which is no longer part
1322                          * of the file.  We also possibly invalidate the
1323                          * last page in the file.  We could use
1324                          * pvn_vpzero(), but this would mark the page as
1325                          * modified and require it to be written back to
1326                          * the server for no particularly good reason.
1327                          * This way, if we access it, then we bring it
1328                          * back in.  A read should be cheaper than a
1329                          * write.
1330                          */
1331                         if (mask & AT_SIZE) {
1332                                 nfs_invalidate_pages(vp,
1333                                     (vap->va_size & PAGEMASK), cr);
1334                         }
1335                         (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1336                         /*
1337                          * If NFS_ACL is supported on the server, then the
1338                          * attributes returned by the server may show minimal
1339                          * permissions, sometimes denying access to users who
1340                          * actually have access.  To get the correct attributes,
1341                          * mark the attributes as expired so that they will be
1342                          * fetched again via the NFS_ACL GETATTR2 procedure.
1343                          */
1344                         if (mi->mi_flags & MI_ACL) {
1345                                 PURGE_ATTRCACHE(vp);
1346                         }
1347                         /*
1348                          * This next check attempts to deal with NFS
1349                          * servers which cannot handle increasing
1350                          * the size of the file via setattr.  Most
1351                          * of these servers do not return an error,
1352                          * but also do not change the size of the file.
1353                          * Hence this check, followed by an attempt to
1354                          * set the file size by writing 1 byte at the
1355                          * offset of the desired end of the file.
1356                          */
1357                         if ((mask & AT_SIZE) &&
1358                             ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1359                                 char zb = '\0';
1360 
1361                                 error = nfswrite(vp, &zb,
1362                                     vap->va_size - sizeof (zb),
1363                                     sizeof (zb), cr);
1364                         }
1365                         /*
1366                          * Some servers will change the mode to clear the setuid
1367                          * and setgid bits when changing the uid or gid.  The
1368                          * client needs to compensate appropriately.
1369                          */
1370                         if (mask & (AT_UID | AT_GID)) {
1371                                 int terror;
1372 
1373                                 va.va_mask = AT_MODE;
1374                                 terror = nfsgetattr(vp, &va, cr);
1375                                 if (!terror &&
1376                                     (((mask & AT_MODE) &&
1377                                     va.va_mode != vap->va_mode) ||
1378                                     (!(mask & AT_MODE) &&
1379                                     va.va_mode != omode))) {
1380                                         va.va_mask = AT_MODE;
1381                                         if (mask & AT_MODE)
1382                                                 va.va_mode = vap->va_mode;
1383                                         else
1384                                                 va.va_mode = omode;
1385                                         (void) nfssetattr(vp, &va, 0, cr);
1386                                 }
1387                         }
1388                 } else {
1389                         PURGE_ATTRCACHE(vp);
1390                         PURGE_STALE_FH(error, vp, cr);
1391                 }
1392         } else {
1393                 PURGE_ATTRCACHE(vp);
1394         }
1395 
1396         return (error);
1397 }
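
/*
 * The sketch below is illustrative only and is not part of the build;
 * the NFS_EXAMPLE_SKETCHES guard and the nfs_sketch_ name are
 * hypothetical.  It restates the "use server time" sentinel from
 * nfssetattr() above: vattr_to_sattr() is assumed to convert
 * nanoseconds to microseconds by dividing by 1000, so the 1 billion
 * nanosecond sentinel becomes the 1 million microsecond marker that
 * the server tests for.
 */
#ifdef NFS_EXAMPLE_SKETCHES
static uint32_t
nfs_sketch_wire_usec(const timestruc_t *tsp)
{
        /* 1000000000 nsec / 1000 == 1000000 usec, the sentinel value */
        return ((uint32_t)(tsp->tv_nsec / 1000));
}
#endif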
1398 
1399 static int
1400 nfs_accessx(void *vp, int mode, cred_t *cr)
1401 {
1402         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1403         return (nfs_access(vp, mode, 0, cr, NULL));
1404 }
1405 
1406 /* ARGSUSED */
1407 static int
1408 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1409 {
1410         struct vattr va;
1411         int error;
1412         mntinfo_t *mi;
1413         int shift = 0;
1414 
1415         mi = VTOMI(vp);
1416 
1417         if (nfs_zone() != mi->mi_zone)
1418                 return (EIO);
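        /*
         * Note (assumed from the structure below): MI_ACL may be
         * cleared from mi_flags while acl_access2() runs, e.g. if the
         * server turns out not to support the NFS_ACL protocol, so the
         * flag is re-tested after the call; if it was cleared, fall
         * through to the mode-based check below.
         */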
1419         if (mi->mi_flags & MI_ACL) {
1420                 error = acl_access2(vp, mode, flags, cr);
1421                 if (mi->mi_flags & MI_ACL)
1422                         return (error);
1423         }
1424 
1425         va.va_mask = AT_MODE | AT_UID | AT_GID;
1426         error = nfsgetattr(vp, &va, cr);
1427         if (error)
1428                 return (error);
1429 
1430         /*
1431          * Disallow write attempts on read-only
1432          * file systems, unless the file is a
1433          * device node.
1434          */
1435         if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1436                 return (EROFS);
1437 
1438         /*
1439          * Disallow attempts to access mandatory lock files.
1440          */
1441         if ((mode & (VWRITE | VREAD | VEXEC)) &&
1442             MANDLOCK(vp, va.va_mode))
1443                 return (EACCES);
1444 
1445         /*
1446          * Access check is based on only
1447          * one of owner, group, public.
1448          * If not owner, then check group.
1449          * If not a member of the group,
1450          * then check public access.
1451          */
1452         if (crgetuid(cr) != va.va_uid) {
1453                 shift += 3;
1454                 if (!groupmember(va.va_gid, cr))
1455                         shift += 3;
1456         }
1457 
1458         return (secpolicy_vnode_access2(cr, vp, va.va_uid,
1459             va.va_mode << shift, mode));
1460 }
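
/*
 * The sketch below is illustrative only and is not part of the build;
 * the guard and name are hypothetical.  It shows the shift trick used
 * in nfs_access() above: the rwx triad that applies to the caller is
 * moved into the owner (0700) position before the policy check, so a
 * caller who is neither the owner nor a group member is checked
 * against the "other" bits.
 */
#ifdef NFS_EXAMPLE_SKETCHES
static mode_t
nfs_sketch_shifted_mode(mode_t va_mode, int shift)
{
        /* e.g. va_mode 0754, shift 6: (0754 << 6) & 0700 == 0400 */
        return ((va_mode << shift) & 0700);
}
#endif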
1461 
1462 static int nfs_do_symlink_cache = 1;
1463 
1464 /* ARGSUSED */
1465 static int
1466 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1467 {
1468         int error;
1469         struct nfsrdlnres rl;
1470         rnode_t *rp;
1471         int douprintf;
1472         failinfo_t fi;
1473 
1474         /*
1475          * We want to be consistent with UFS semantics, so we return
1476          * EINVAL instead of ENXIO.  This violates the XNFS spec and
1477          * RFC 1094, which are wrong anyway.  BUGID 1138002.
1478          */
1479         if (vp->v_type != VLNK)
1480                 return (EINVAL);
1481 
1482         if (nfs_zone() != VTOMI(vp)->mi_zone)
1483                 return (EIO);
1484 
1485         rp = VTOR(vp);
1486         if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1487                 error = nfs_validate_caches(vp, cr);
1488                 if (error)
1489                         return (error);
1490                 mutex_enter(&rp->r_statelock);
1491                 if (rp->r_symlink.contents != NULL) {
1492                         error = uiomove(rp->r_symlink.contents,
1493                             rp->r_symlink.len, UIO_READ, uiop);
1494                         mutex_exit(&rp->r_statelock);
1495                         return (error);
1496                 }
1497                 mutex_exit(&rp->r_statelock);
1498         }
1499 
1501         rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1502 
1503         fi.vp = vp;
1504         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1505         fi.copyproc = nfscopyfh;
1506         fi.lookupproc = nfslookup;
1507         fi.xattrdirproc = acl_getxattrdir2;
1508 
1509         douprintf = 1;
1510 
1511         error = rfs2call(VTOMI(vp), RFS_READLINK,
1512             xdr_readlink, (caddr_t)VTOFH(vp),
1513             xdr_rdlnres, (caddr_t)&rl, cr,
1514             &douprintf, &rl.rl_status, 0, &fi);
1515 
1516         if (error) {
1518                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1519                 return (error);
1520         }
1521 
1522         error = geterrno(rl.rl_status);
1523         if (!error) {
1524                 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1525                 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1526                         mutex_enter(&rp->r_statelock);
1527                         if (rp->r_symlink.contents == NULL) {
1528                                 rp->r_symlink.contents = rl.rl_data;
1529                                 rp->r_symlink.len = (int)rl.rl_count;
1530                                 rp->r_symlink.size = NFS_MAXPATHLEN;
1531                                 mutex_exit(&rp->r_statelock);
1532                         } else {
1533                                 mutex_exit(&rp->r_statelock);
1535                                 kmem_free((void *)rl.rl_data,
1536                                     NFS_MAXPATHLEN);
1537                         }
1538                 } else {
1540                         kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1541                 }
1542         } else {
1543                 PURGE_STALE_FH(error, vp, cr);
1545                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1546         }
1547 
1548         /*
1549          * Conform to UFS semantics (see comment above)
1550          */
1551         return (error == ENXIO ? EINVAL : error);
1552 }
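
/*
 * The sketch below is illustrative only and is not part of the build;
 * the guard and name are hypothetical.  It restates the publish-or-free
 * pattern used by nfs_readlink() above: the symlink buffer is filled
 * without r_statelock held, then installed in the rnode only if no
 * other thread won the race; the loser frees its copy.
 */
#ifdef NFS_EXAMPLE_SKETCHES
static void
nfs_sketch_publish_symlink(rnode_t *rp, char *buf, int len)
{
        mutex_enter(&rp->r_statelock);
        if (rp->r_symlink.contents == NULL) {
                /* We won the race: publish the buffer in the rnode. */
                rp->r_symlink.contents = buf;
                rp->r_symlink.len = len;
                rp->r_symlink.size = NFS_MAXPATHLEN;
                mutex_exit(&rp->r_statelock);
        } else {
                /* Another thread published first: free our copy. */
                mutex_exit(&rp->r_statelock);
                kmem_free(buf, NFS_MAXPATHLEN);
        }
}
#endif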
1553 
1554 /*
1555  * Flush local dirty pages to stable storage on the server.
1556  *
1557  * If FNODSYNC is specified, then there is nothing to do because
1558  * metadata changes are not cached on the client before being
1559  * sent to the server.
1560  */
1561 /* ARGSUSED */
1562 static int
1563 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1564 {
1565         int error;
1566 
1567         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1568                 return (0);
1569 
1570         if (nfs_zone() != VTOMI(vp)->mi_zone)
1571                 return (EIO);
1572 
1573         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1574         if (!error)
1575                 error = VTOR(vp)->r_error;
1576         return (error);
1577 }
1578 
1580 /*
1581  * Weirdness: if the file was removed or was the target of a rename
1582  * operation while it was open, it was renamed instead.  Here we
1583  * remove the renamed file.
1584  */
1585 /* ARGSUSED */
1586 static void
1587 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1588 {
1589         rnode_t *rp;
1590 
1591         ASSERT(vp != DNLC_NO_VNODE);
1592 
1593         /*
1594          * If this is coming from the wrong zone, we let someone in the right
1595          * zone take care of it asynchronously.  We can get here due to
1596          * VN_RELE() being called from pageout() or fsflush().  This call may
1597          * potentially turn into an expensive no-op if, for instance, v_count
1598          * gets incremented in the meantime, but it's still correct.
1599          */
1600         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1601                 nfs_async_inactive(vp, cr, nfs_inactive);
1602                 return;
1603         }
1604 
1605         rp = VTOR(vp);
1606 redo:
1607         if (rp->r_unldvp != NULL) {
1608                 /*
1609                  * Save the vnode pointer for the directory where the
1610                  * unlinked-open file got renamed, then set it to NULL
1611                  * to prevent another thread from getting here before
1612                  * we're done with the remove.  While we have the
1613                  * statelock, make local copies of the pertinent rnode
1614                  * fields.  If we didn't do this atomically, the
1615                  * unl* fields could become inconsistent with respect
1616                  * to each other due to a race condition between this
1617                  * code and nfs_remove().  See bug report 1034328.
1618                  */
1619                 mutex_enter(&rp->r_statelock);
1620                 if (rp->r_unldvp != NULL) {
1621                         vnode_t *unldvp;
1622                         char *unlname;
1623                         cred_t *unlcred;
1624                         struct nfsdiropargs da;
1625                         enum nfsstat status;
1626                         int douprintf;
1627                         int error;
1628 
1629                         unldvp = rp->r_unldvp;
1630                         rp->r_unldvp = NULL;
1631                         unlname = rp->r_unlname;
1632                         rp->r_unlname = NULL;
1633                         unlcred = rp->r_unlcred;
1634                         rp->r_unlcred = NULL;
1635                         mutex_exit(&rp->r_statelock);
1636 
1637                         /*
1638                          * If there are any dirty pages left, then flush
1639                          * them.  This is unfortunate because they just
1640                          * may get thrown away during the remove operation,
1641                          * but we have to do this for correctness.
1642                          */
1643                         if (vn_has_cached_data(vp) &&
1644                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1645                                 ASSERT(vp->v_type != VCHR);
1646                                 error = nfs_putpage(vp, (offset_t)0, 0, 0,
1647                                     cr, ct);
1648                                 if (error) {
1649                                         mutex_enter(&rp->r_statelock);
1650                                         if (!rp->r_error)
1651                                                 rp->r_error = error;
1652                                         mutex_exit(&rp->r_statelock);
1653                                 }
1654                         }
1655 
1656                         /*
1657                          * Do the remove operation on the renamed file
1658                          */
1659                         setdiropargs(&da, unlname, unldvp);
1660 
1661                         douprintf = 1;
1662 
1663                         (void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1664                             xdr_diropargs, (caddr_t)&da,
1665                             xdr_enum, (caddr_t)&status, unlcred,
1666                             &douprintf, &status, 0, NULL);
1667 
1668                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1669                                 nfs_purge_rddir_cache(unldvp);
1670                         PURGE_ATTRCACHE(unldvp);
1671 
1672                         /*
1673                          * Release stuff held for the remove
1674                          */
1675                         VN_RELE(unldvp);
1676                         kmem_free(unlname, MAXNAMELEN);
1677                         crfree(unlcred);
1678                         goto redo;
1679                 }
1680                 mutex_exit(&rp->r_statelock);
1681         }
1682 
1683         rp_addfree(rp, cr);
1684 }
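
/*
 * Summary of the unlinked-open-file lifecycle handled above: when an
 * open file is removed (or is the target of a rename), nfs_remove()
 * and nfsrename() below rename it to a temporary name from newname()
 * and record the directory, name, and credentials in r_unldvp,
 * r_unlname, and r_unlcred.  When the last reference is released,
 * nfs_inactive() sends the RFS_REMOVE for the renamed file.
 */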
1685 
1686 /*
1687  * Remote file system operations having to do with directory manipulation.
1688  */
1689 
1690 /* ARGSUSED */
1691 static int
1692 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1693         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1694         int *direntflags, pathname_t *realpnp)
1695 {
1696         int error;
1697         vnode_t *vp;
1698         vnode_t *avp = NULL;
1699         rnode_t *drp;
1700 
1701         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1702                 return (EPERM);
1703 
1704         drp = VTOR(dvp);
1705 
1706         /*
1707          * Are we looking up extended attributes?  If so, "dvp" is
1708          * the file or directory for which we want attributes, and
1709          * we need a lookup of the hidden attribute directory
1710          * before we lookup the rest of the path.
1711          */
1712         if (flags & LOOKUP_XATTR) {
1713                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1714                 mntinfo_t *mi;
1715 
1716                 mi = VTOMI(dvp);
1717                 if (!(mi->mi_flags & MI_EXTATTR))
1718                         return (EINVAL);
1719 
1720                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1721                         return (EINTR);
1722 
1723                 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1724                 if (avp == NULL)
1725                         error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1726                 else
1727                         error = 0;
1728 
1729                 nfs_rw_exit(&drp->r_rwlock);
1730 
1731                 if (error) {
1732                         if (mi->mi_flags & MI_EXTATTR)
1733                                 return (error);
1734                         return (EINVAL);
1735                 }
1736                 dvp = avp;
1737                 drp = VTOR(dvp);
1738         }
1739 
1740         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1741                 error = EINTR;
1742                 goto out;
1743         }
1744 
1745         error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1746 
1747         nfs_rw_exit(&drp->r_rwlock);
1748 
1749         /*
1750          * If vnode is a device, create special vnode.
1751          */
1752         if (!error && IS_DEVVP(*vpp)) {
1753                 vp = *vpp;
1754                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1755                 VN_RELE(vp);
1756         }
1757 
1758 out:
1759         if (avp != NULL)
1760                 VN_RELE(avp);
1761 
1762         return (error);
1763 }
1764 
1765 static int nfs_lookup_neg_cache = 1;
1766 
1767 #ifdef DEBUG
1768 static int nfs_lookup_dnlc_hits = 0;
1769 static int nfs_lookup_dnlc_misses = 0;
1770 static int nfs_lookup_dnlc_neg_hits = 0;
1771 static int nfs_lookup_dnlc_disappears = 0;
1772 static int nfs_lookup_dnlc_lookups = 0;
1773 #endif
1774 
1775 /* ARGSUSED */
1776 int
1777 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1778         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1779 {
1780         int error;
1781 
1782         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1783 
1784         /*
1785          * If lookup is for "", just return dvp.  Don't need
1786          * to send it over the wire, look it up in the dnlc,
1787          * or perform any access checks.
1788          */
1789         if (*nm == '\0') {
1790                 VN_HOLD(dvp);
1791                 *vpp = dvp;
1792                 return (0);
1793         }
1794 
1795         /*
1796          * Can't do lookups in non-directories.
1797          */
1798         if (dvp->v_type != VDIR)
1799                 return (ENOTDIR);
1800 
1801         /*
1802          * If we're called with RFSCALL_SOFT, it's important that
1803          * the only rfscall is one we make directly; if we permit
1804          * an access call because we're looking up "." or validating
1805          * a dnlc hit, we'll deadlock because that rfscall will
1806          * not have RFSCALL_SOFT set.
1807          */
1808         if (rfscall_flags & RFSCALL_SOFT)
1809                 goto callit;
1810 
1811         /*
1812          * If lookup is for ".", just return dvp.  Don't need
1813          * to send it over the wire or look it up in the dnlc,
1814          * just need to check access.
1815          */
1816         if (strcmp(nm, ".") == 0) {
1817                 error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1818                 if (error)
1819                         return (error);
1820                 VN_HOLD(dvp);
1821                 *vpp = dvp;
1822                 return (0);
1823         }
1824 
1825         /*
1826          * Lookup this name in the DNLC.  If there was a valid entry,
1827          * then return the results of the lookup.
1828          */
1829         error = nfslookup_dnlc(dvp, nm, vpp, cr);
1830         if (error || *vpp != NULL)
1831                 return (error);
1832 
1833 callit:
1834         error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1835 
1836         return (error);
1837 }
1838 
1839 static int
1840 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1841 {
1842         int error;
1843         vnode_t *vp;
1844 
1845         ASSERT(*nm != '\0');
1846         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1847 
1848         /*
1849          * Lookup this name in the DNLC.  If successful, then validate
1850          * the caches and then recheck the DNLC.  The DNLC is rechecked
1851          * just in case this entry got invalidated during the call
1852          * to nfs_validate_caches.
1853          *
1854          * We assume that it is safe to claim that a file exists even
1855          * though it may no longer exist on the server.  Any operations
1856          * sent to the server for such a file will fail with ESTALE.
1857          */
1858 #ifdef DEBUG
1859         nfs_lookup_dnlc_lookups++;
1860 #endif
1861         vp = dnlc_lookup(dvp, nm);
1862         if (vp != NULL) {
1863                 VN_RELE(vp);
1864                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1865                         PURGE_ATTRCACHE(dvp);
1866                 }
1867                 error = nfs_validate_caches(dvp, cr);
1868                 if (error)
1869                         return (error);
1870                 vp = dnlc_lookup(dvp, nm);
1871                 if (vp != NULL) {
1872                         error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1873                         if (error) {
1874                                 VN_RELE(vp);
1875                                 return (error);
1876                         }
1877                         if (vp == DNLC_NO_VNODE) {
1878                                 VN_RELE(vp);
1879 #ifdef DEBUG
1880                                 nfs_lookup_dnlc_neg_hits++;
1881 #endif
1882                                 return (ENOENT);
1883                         }
1884                         *vpp = vp;
1885 #ifdef DEBUG
1886                         nfs_lookup_dnlc_hits++;
1887 #endif
1888                         return (0);
1889                 }
1890 #ifdef DEBUG
1891                 nfs_lookup_dnlc_disappears++;
1892 #endif
1893         }
1894 #ifdef DEBUG
1895         else
1896                 nfs_lookup_dnlc_misses++;
1897 #endif
1898 
1899         *vpp = NULL;
1900 
1901         return (0);
1902 }
1903 
1904 static int
1905 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1906         int rfscall_flags)
1907 {
1908         int error;
1909         struct nfsdiropargs da;
1910         struct nfsdiropres dr;
1911         int douprintf;
1912         failinfo_t fi;
1913         hrtime_t t;
1914 
1915         ASSERT(*nm != '\0');
1916         ASSERT(dvp->v_type == VDIR);
1917         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1918 
1919         setdiropargs(&da, nm, dvp);
1920 
1921         fi.vp = dvp;
1922         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1923         fi.copyproc = nfscopyfh;
1924         fi.lookupproc = nfslookup;
1925         fi.xattrdirproc = acl_getxattrdir2;
1926 
1927         douprintf = 1;
1928 
1929         t = gethrtime();
1930 
1931         error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1932             xdr_diropargs, (caddr_t)&da,
1933             xdr_diropres, (caddr_t)&dr, cr,
1934             &douprintf, &dr.dr_status, rfscall_flags, &fi);
1935 
1936         if (!error) {
1937                 error = geterrno(dr.dr_status);
1938                 if (!error) {
1939                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1940                             dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1941                         /*
1942                          * If NFS_ACL is supported on the server, then the
1943                          * attributes returned by the server may show minimal
1944                          * permissions, sometimes denying access to users who
1945                          * actually have access.  To get the correct attributes,
1946                          * mark the attributes as expired so that they will be
1947                          * fetched again via the NFS_ACL GETATTR2 procedure.
1948                          */
1949                         if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1950                                 PURGE_ATTRCACHE(*vpp);
1951                         }
1952                         if (!(rfscall_flags & RFSCALL_SOFT))
1953                                 dnlc_update(dvp, nm, *vpp);
1954                 } else {
1955                         PURGE_STALE_FH(error, dvp, cr);
1956                         if (error == ENOENT && nfs_lookup_neg_cache)
1957                                 dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1958                 }
1959         }
1960 
1961         return (error);
1962 }
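
/*
 * Note on negative caching above: when the server returns ENOENT and
 * nfs_lookup_neg_cache is set, the name is entered in the dnlc as
 * DNLC_NO_VNODE.  nfslookup_dnlc() turns such an entry back into
 * ENOENT (counted as a neg_hit in the DEBUG counters) without another
 * over-the-wire lookup.
 */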
1963 
1964 /* ARGSUSED */
1965 static int
1966 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1967         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1968         vsecattr_t *vsecp)
1969 {
1970         int error;
1971         struct nfscreatargs args;
1972         struct nfsdiropres dr;
1973         int douprintf;
1974         vnode_t *vp;
1975         rnode_t *rp;
1976         struct vattr vattr;
1977         rnode_t *drp;
1978         vnode_t *tempvp;
1979         hrtime_t t;
1980 
1981         drp = VTOR(dvp);
1982 
1983         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1984                 return (EPERM);
1985         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1986                 return (EINTR);
1987 
1988         /*
1989          * We make a copy of the attributes because the caller does not
1990          * expect us to change what va points to.
1991          */
1992         vattr = *va;
1993 
1994         /*
1995          * If the pathname is "", just use dvp.  Don't need
1996          * to send it over the wire, look it up in the dnlc,
1997          * or perform any access checks.
1998          */
1999         if (*nm == '\0') {
2000                 error = 0;
2001                 VN_HOLD(dvp);
2002                 vp = dvp;
2003         /*
2004          * If the pathname is ".", just use dvp.  Don't need
2005          * to send it over the wire or look it up in the dnlc,
2006          * just need to check access.
2007          */
2008         } else if (strcmp(nm, ".") == 0) {
2009                 error = nfs_access(dvp, VEXEC, 0, cr, ct);
2010                 if (error) {
2011                         nfs_rw_exit(&drp->r_rwlock);
2012                         return (error);
2013                 }
2014                 VN_HOLD(dvp);
2015                 vp = dvp;
2016         /*
2017          * We need to go over the wire just to be sure whether the
2018          * file exists or not.  Using the DNLC is dangerous here
2019          * when deciding whether the file exists.
2020          */
2021         } else {
2022                 error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2023         }
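        /*
         * Note: NFS Version 2 has no exclusive-create primitive, so
         * exclusive create is emulated with the lookup above followed
         * by RFS_CREATE below; there is an unavoidable window in which
         * another client can create the file between the two calls.
         */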
2024         if (!error) {
2025                 if (exclusive == EXCL)
2026                         error = EEXIST;
2027                 else if (vp->v_type == VDIR && (mode & VWRITE))
2028                         error = EISDIR;
2029                 else {
2030                         /*
2031                          * If vnode is a device, create special vnode.
2032                          */
2033                         if (IS_DEVVP(vp)) {
2034                                 tempvp = vp;
2035                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2036                                 VN_RELE(tempvp);
2037                         }
2038                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2039                                 if ((vattr.va_mask & AT_SIZE) &&
2040                                     vp->v_type == VREG) {
2041                                         vattr.va_mask = AT_SIZE;
2042                                         error = nfssetattr(vp, &vattr, 0, cr);
2043 
2044                                         if (!error) {
2045                                                 /*
2046                                                  * Existing file was truncated;
2047                                                  * emit a create event.
2048                                                  */
2049                                                 vnevent_create(vp, ct);
2050                                         }
2051                                 }
2052                         }
2053                 }
2054                 nfs_rw_exit(&drp->r_rwlock);
2055                 if (error) {
2056                         VN_RELE(vp);
2057                 } else {
2058                         *vpp = vp;
2059                 }
2060                 return (error);
2061         }
2062 
2063         ASSERT(vattr.va_mask & AT_TYPE);
2064         if (vattr.va_type == VREG) {
2065                 ASSERT(vattr.va_mask & AT_MODE);
2066                 if (MANDMODE(vattr.va_mode)) {
2067                         nfs_rw_exit(&drp->r_rwlock);
2068                         return (EACCES);
2069                 }
2070         }
2071 
2072         dnlc_remove(dvp, nm);
2073 
2074         setdiropargs(&args.ca_da, nm, dvp);
2075 
2076         /*
2077          * Decide what the group-id of the created file should be.
2078          * Set it in the attribute list as advisory, then do a setattr
2079          * if the server didn't get it right the first time.
2080          */
2081         error = setdirgid(dvp, &vattr.va_gid, cr);
2082         if (error) {
2083                 nfs_rw_exit(&drp->r_rwlock);
2084                 return (error);
2085         }
2086         vattr.va_mask |= AT_GID;
2087 
2088         /*
2089          * This is a completely gross hack to make mknod
2090          * work over the wire until we can whack the protocol.
2091          */
2092 #define IFCHR           0020000         /* character special */
2093 #define IFBLK           0060000         /* block special */
2094 #define IFSOCK          0140000         /* socket */
2095 
2096         /*
2097          * dev_t is uint_t in 5.x and short in 4.x.  4.x supports
2098          * 8-bit major and 8-bit minor numbers; 5.x supports 14-bit
2099          * majors and 18-bit minors.  If the 5.x major and minor
2100          * numbers both fit in 8 bits, compress the device number
2101          * before sending it.  Otherwise send the expanded form; a
2102          * 4.x server will then not create the device with the
2103          * correct device number, and nothing can be done about this.
2104          */
2105         if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2106                 dev_t d = vattr.va_rdev;
2107                 dev32_t dev32;
2108 
2109                 if (vattr.va_type == VCHR)
2110                         vattr.va_mode |= IFCHR;
2111                 else
2112                         vattr.va_mode |= IFBLK;
2113 
2114                 (void) cmpldev(&dev32, d);
2115                 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2116                         vattr.va_size = (u_offset_t)dev32;
2117                 else
2118                         vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2119 
2120                 vattr.va_mask |= AT_MODE|AT_SIZE;
2121         } else if (vattr.va_type == VFIFO) {
2122                 vattr.va_mode |= IFCHR;         /* xtra kludge for namedpipe */
2123                 vattr.va_size = (u_offset_t)NFS_FIFO_DEV;       /* blech */
2124                 vattr.va_mask |= AT_MODE|AT_SIZE;
2125         } else if (vattr.va_type == VSOCK) {
2126                 vattr.va_mode |= IFSOCK;
2127                 /*
2128                  * To avoid triggering bugs in servers, set AT_SIZE
2129                  * (all other RFS_CREATE calls set this).
2130                  */
2131                 vattr.va_size = 0;
2132                 vattr.va_mask |= AT_MODE|AT_SIZE;
2133         }
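        /*
         * Worked example for the compression above (illustrative,
         * assuming 8-bit SunOS 4.x major and minor fields): a device
         * with major 3 and minor 5 fits the old format and is sent
         * compressed via nfsv2_cmpdev(), while a device with an 18-bit
         * minor cannot be represented that way, so the expanded 32-bit
         * dev32 encoding is sent instead.
         */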
2134 
2135         args.ca_sa = &args.ca_sa_buf;
2136         error = vattr_to_sattr(&vattr, args.ca_sa);
2137         if (error) {
2138                 /* req time field(s) overflow - return immediately */
2139                 nfs_rw_exit(&drp->r_rwlock);
2140                 return (error);
2141         }
2142 
2143         douprintf = 1;
2144 
2145         t = gethrtime();
2146 
2147         error = rfs2call(VTOMI(dvp), RFS_CREATE,
2148             xdr_creatargs, (caddr_t)&args,
2149             xdr_diropres, (caddr_t)&dr, cr,
2150             &douprintf, &dr.dr_status, 0, NULL);
2151 
2152         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2153 
2154         if (!error) {
2155                 error = geterrno(dr.dr_status);
2156                 if (!error) {
2157                         if (HAVE_RDDIR_CACHE(drp))
2158                                 nfs_purge_rddir_cache(dvp);
2159                         vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2160                             dvp->v_vfsp, t, cr, NULL, NULL);
2161                         /*
2162                          * If NFS_ACL is supported on the server, then the
2163                          * attributes returned by the server may show minimal
2164                          * permissions, sometimes denying access to users who
2165                          * actually have access.  To get the correct attributes,
2166                          * mark the attributes as expired so that they will be
2167                          * fetched again via the NFS_ACL GETATTR2 procedure.
2168                          */
2169                         if (VTOMI(vp)->mi_flags & MI_ACL) {
2170                                 PURGE_ATTRCACHE(vp);
2171                         }
2172                         dnlc_update(dvp, nm, vp);
2173                         rp = VTOR(vp);
2174                         if (vattr.va_size == 0) {
2175                                 mutex_enter(&rp->r_statelock);
2176                                 rp->r_size = 0;
2177                                 mutex_exit(&rp->r_statelock);
2178                                 if (vn_has_cached_data(vp)) {
2179                                         ASSERT(vp->v_type != VCHR);
2180                                         nfs_invalidate_pages(vp,
2181                                             (u_offset_t)0, cr);
2182                                 }
2183                         }
2184 
2185                         /*
2186                          * Make sure the gid was set correctly.
2187                          * If not, try to set it (but don't lose
2188                          * any sleep over it).
2189                          */
2190                         if (vattr.va_gid != rp->r_attr.va_gid) {
2191                                 vattr.va_mask = AT_GID;
2192                                 (void) nfssetattr(vp, &vattr, 0, cr);
2193                         }
2194 
2195                         /*
2196                          * If vnode is a device, create a special vnode.
2197                          */
2198                         if (IS_DEVVP(vp)) {
2199                                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2200                                 VN_RELE(vp);
2201                         } else
2202                                 *vpp = vp;
2203                 } else {
2204                         PURGE_STALE_FH(error, dvp, cr);
2205                 }
2206         }
2207 
2208         nfs_rw_exit(&drp->r_rwlock);
2209 
2210         return (error);
2211 }
2212 
2213 /*
2214  * Weirdness: if the vnode to be removed is open,
2215  * we rename it instead of removing it, and nfs_inactive
2216  * will remove the new name.
2217  */
2218 /* ARGSUSED */
2219 static int
2220 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2221 {
2222         int error;
2223         struct nfsdiropargs da;
2224         enum nfsstat status;
2225         vnode_t *vp;
2226         char *tmpname;
2227         int douprintf;
2228         rnode_t *rp;
2229         rnode_t *drp;
2230 
2231         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2232                 return (EPERM);
2233         drp = VTOR(dvp);
2234         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2235                 return (EINTR);
2236 
2237         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2238         if (error) {
2239                 nfs_rw_exit(&drp->r_rwlock);
2240                 return (error);
2241         }
2242 
2243         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2244                 VN_RELE(vp);
2245                 nfs_rw_exit(&drp->r_rwlock);
2246                 return (EPERM);
2247         }
2248 
2249         /*
2250          * First just remove the entry from the name cache, as it
2251          * is most likely the only entry for this vp.
2252          */
2253         dnlc_remove(dvp, nm);
2254 
2255         /*
2256          * If the file has a v_count > 1 then there may be more than one
2257          * entry in the name cache due to multiple links or an open
2258          * file, but we don't have the real reference count, so flush
2259          * all possible entries.
2260          */
2261         if (vp->v_count > 1)
2262                 dnlc_purge_vp(vp);
2263 
2264         /*
2265          * Now we have the real reference count on the vnode
2266          */
2267         rp = VTOR(vp);
2268         mutex_enter(&rp->r_statelock);
2269         if (vp->v_count > 1 &&
2270             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2271                 mutex_exit(&rp->r_statelock);
2272                 tmpname = newname();
2273                 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2274                 if (error)
2275                         kmem_free(tmpname, MAXNAMELEN);
2276                 else {
2277                         mutex_enter(&rp->r_statelock);
2278                         if (rp->r_unldvp == NULL) {
2279                                 VN_HOLD(dvp);
2280                                 rp->r_unldvp = dvp;
2281                                 if (rp->r_unlcred != NULL)
2282                                         crfree(rp->r_unlcred);
2283                                 crhold(cr);
2284                                 rp->r_unlcred = cr;
2285                                 rp->r_unlname = tmpname;
2286                         } else {
2287                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2288                                 rp->r_unlname = tmpname;
2289                         }
2290                         mutex_exit(&rp->r_statelock);
2291                 }
2292         } else {
2293                 mutex_exit(&rp->r_statelock);
2294                 /*
2295                  * We need to flush any dirty pages which happen to
2296                  * be hanging around before removing the file.  This
2297                  * shouldn't happen very often, and mostly occurs on
2298                  * file systems mounted "nocto".
2299                  */
2300                 if (vn_has_cached_data(vp) &&
2301                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2302                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2303                         if (error && (error == ENOSPC || error == EDQUOT)) {
2304                                 mutex_enter(&rp->r_statelock);
2305                                 if (!rp->r_error)
2306                                         rp->r_error = error;
2307                                 mutex_exit(&rp->r_statelock);
2308                         }
2309                 }
2310 
2311                 setdiropargs(&da, nm, dvp);
2312 
2313                 douprintf = 1;
2314 
2315                 error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2316                     xdr_diropargs, (caddr_t)&da,
2317                     xdr_enum, (caddr_t)&status, cr,
2318                     &douprintf, &status, 0, NULL);
2319 
2320                 /*
2321                  * The xattr dir may be gone after the last attr is removed,
2322                  * so flush it from dnlc.
2323                  */
2324                 if (dvp->v_flag & V_XATTRDIR)
2325                         dnlc_purge_vp(dvp);
2326 
2327                 PURGE_ATTRCACHE(dvp);   /* mod time changed */
2328                 PURGE_ATTRCACHE(vp);    /* link count changed */
2329 
2330                 if (!error) {
2331                         error = geterrno(status);
2332                         if (!error) {
2333                                 if (HAVE_RDDIR_CACHE(drp))
2334                                         nfs_purge_rddir_cache(dvp);
2335                         } else {
2336                                 PURGE_STALE_FH(error, dvp, cr);
2337                         }
2338                 }
2339         }
2340 
2341         if (error == 0) {
2342                 vnevent_remove(vp, dvp, nm, ct);
2343         }
2344         VN_RELE(vp);
2345 
2346         nfs_rw_exit(&drp->r_rwlock);
2347 
2348         return (error);
2349 }
2350 
2351 /* ARGSUSED */
2352 static int
2353 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2354         caller_context_t *ct, int flags)
2355 {
2356         int error;
2357         struct nfslinkargs args;
2358         enum nfsstat status;
2359         vnode_t *realvp;
2360         int douprintf;
2361         rnode_t *tdrp;
2362 
2363         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2364                 return (EPERM);
2365         if (VOP_REALVP(svp, &realvp, ct) == 0)
2366                 svp = realvp;
2367 
2368         args.la_from = VTOFH(svp);
2369         setdiropargs(&args.la_to, tnm, tdvp);
2370 
2371         tdrp = VTOR(tdvp);
2372         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2373                 return (EINTR);
2374 
2375         dnlc_remove(tdvp, tnm);
2376 
2377         douprintf = 1;
2378 
2379         error = rfs2call(VTOMI(svp), RFS_LINK,
2380             xdr_linkargs, (caddr_t)&args,
2381             xdr_enum, (caddr_t)&status, cr,
2382             &douprintf, &status, 0, NULL);
2383 
2384         PURGE_ATTRCACHE(tdvp);  /* mod time changed */
2385         PURGE_ATTRCACHE(svp);   /* link count changed */
2386 
2387         if (!error) {
2388                 error = geterrno(status);
2389                 if (!error) {
2390                         if (HAVE_RDDIR_CACHE(tdrp))
2391                                 nfs_purge_rddir_cache(tdvp);
2392                 }
2393         }
2394 
2395         nfs_rw_exit(&tdrp->r_rwlock);
2396 
2397         if (!error) {
2398                 /*
2399                  * Notify the source file of this link operation.
2400                  */
2401                 vnevent_link(svp, ct);
2402         }
2403         return (error);
2404 }
2405 
2406 /* ARGSUSED */
2407 static int
2408 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2409         caller_context_t *ct, int flags)
2410 {
2411         vnode_t *realvp;
2412 
2413         if (nfs_zone() != VTOMI(odvp)->mi_zone)
2414                 return (EPERM);
2415         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2416                 ndvp = realvp;
2417 
2418         return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2419 }
2420 
2421 /*
2422  * nfsrename does the real work of renaming in NFS Version 2.
2423  */
2424 static int
2425 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2426     caller_context_t *ct)
2427 {
2428         int error;
2429         enum nfsstat status;
2430         struct nfsrnmargs args;
2431         int douprintf;
2432         vnode_t *nvp = NULL;
2433         vnode_t *ovp = NULL;
2434         char *tmpname;
2435         rnode_t *rp;
2436         rnode_t *odrp;
2437         rnode_t *ndrp;
2438 
2439         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2440         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2441             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2442                 return (EINVAL);
2443 
2444         odrp = VTOR(odvp);
2445         ndrp = VTOR(ndvp);
2446         if ((intptr_t)odrp < (intptr_t)ndrp) {
2447                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2448                         return (EINTR);
2449                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2450                         nfs_rw_exit(&odrp->r_rwlock);
2451                         return (EINTR);
2452                 }
2453         } else {
2454                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2455                         return (EINTR);
2456                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2457                         nfs_rw_exit(&ndrp->r_rwlock);
2458                         return (EINTR);
2459                 }
2460         }
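        /*
         * The two directory rwlocks above are always taken in
         * ascending rnode address order, so two concurrent renames
         * involving the same pair of directories agree on which lock
         * comes first and cannot deadlock against each other.
         */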
2461 
2462         /*
2463          * Lookup the target file.  If it exists, it needs to be
2464          * checked to see whether it is a mount point and whether
2465          * it is active (open).
2466          */
2467         error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2468         if (!error) {
2469                 /*
2470                  * If this file has been mounted on, then just
2471                  * return busy because renaming to it would remove
2472                  * the mounted file system from the name space.
2473                  */
2474                 if (vn_mountedvfs(nvp) != NULL) {
2475                         VN_RELE(nvp);
2476                         nfs_rw_exit(&odrp->r_rwlock);
2477                         nfs_rw_exit(&ndrp->r_rwlock);
2478                         return (EBUSY);
2479                 }
2480 
2481                 /*
2482                  * Purge the name cache of all references to this vnode
2483                  * so that we can check the reference count to infer
2484                  * whether it is active or not.
2485                  */
2486                 /*
2487                  * First just remove the entry from the name cache, as it
2488                  * is most likely the only entry for this vp.
2489                  */
2490                 dnlc_remove(ndvp, nnm);
2491                 /*
2492                  * If the file has a v_count > 1 then there may be more
2493                  * than one entry in the name cache due to multiple
2494                  * links or an open file, but we don't have the real
2495                  * reference count, so flush all possible entries.
2496                  */
2497                 if (nvp->v_count > 1)
2498                         dnlc_purge_vp(nvp);
2499 
2500                 /*
2501                  * If the vnode is active and is not a directory,
2502                  * arrange to rename it to a
2503                  * temporary file so that it will continue to be
2504                  * accessible.  This implements the "unlink-open-file"
2505                  * semantics for the target of a rename operation.
2506                  * Before doing this though, make sure that the
2507                  * source and target files are not already the same.
2508                  */
2509                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2510                         /*
2511                          * Lookup the source name.
2512                          */
2513                         error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2514                             cr, 0);
2515 
2516                         /*
2517                          * The source name *should* already exist.
2518                          */
2519                         if (error) {
2520                                 VN_RELE(nvp);
2521                                 nfs_rw_exit(&odrp->r_rwlock);
2522                                 nfs_rw_exit(&ndrp->r_rwlock);
2523                                 return (error);
2524                         }
2525 
2526                         /*
2527                          * Compare the two vnodes.  If they are the same,
2528                          * just release all held vnodes and return success.
2529                          */
2530                         if (ovp == nvp) {
2531                                 VN_RELE(ovp);
2532                                 VN_RELE(nvp);
2533                                 nfs_rw_exit(&odrp->r_rwlock);
2534                                 nfs_rw_exit(&ndrp->r_rwlock);
2535                                 return (0);
2536                         }
2537 
2538                         /*
2539                          * Can't mix and match directories and non-
2540                          * directories in rename operations.  We already
2541                          * know that the target is not a directory.  If
2542                          * the source is a directory, return an error.
2543                          */
2544                         if (ovp->v_type == VDIR) {
2545                                 VN_RELE(ovp);
2546                                 VN_RELE(nvp);
2547                                 nfs_rw_exit(&odrp->r_rwlock);
2548                                 nfs_rw_exit(&ndrp->r_rwlock);
2549                                 return (ENOTDIR);
2550                         }
2551 
2552                         /*
2553                          * The target file exists, is not the same as
2554                          * the source file, and is active.  Link it
2555                          * to a temporary filename to avoid having
2556                          * the server remove the file completely.
2557                          */
2558                         tmpname = newname();
2559                         error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2560                         if (error == EOPNOTSUPP) {
2561                                 error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2562                                     cr, NULL, 0);
2563                         }
2564                         if (error) {
2565                                 kmem_free(tmpname, MAXNAMELEN);
2566                                 VN_RELE(ovp);
2567                                 VN_RELE(nvp);
2568                                 nfs_rw_exit(&odrp->r_rwlock);
2569                                 nfs_rw_exit(&ndrp->r_rwlock);
2570                                 return (error);
2571                         }
2572                         rp = VTOR(nvp);
2573                         mutex_enter(&rp->r_statelock);
2574                         if (rp->r_unldvp == NULL) {
2575                                 VN_HOLD(ndvp);
2576                                 rp->r_unldvp = ndvp;
2577                                 if (rp->r_unlcred != NULL)
2578                                         crfree(rp->r_unlcred);
2579                                 crhold(cr);
2580                                 rp->r_unlcred = cr;
2581                                 rp->r_unlname = tmpname;
2582                         } else {
2583                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2584                                 rp->r_unlname = tmpname;
2585                         }
2586                         mutex_exit(&rp->r_statelock);
2587                 }
2588         }
2589 
2590         if (ovp == NULL) {
2591                 /*
2592                  * When renaming directories to be a subdirectory of a
2593                  * different parent, the dnlc entry for ".." will no
2594                  * longer be valid, so it must be removed.
2595                  *
2596                  * We do a lookup here to determine whether we are
2597                  * renaming a directory and whether we are renaming an
2598                  * unlinked file.  This might already have been done by
2599                  * the code above, so we check ovp == NULL to avoid
2600                  * doing it twice.
2601                  */
2602 
2603                 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2604 
2605                 /*
2606                  * The source name *should* already exist.
2607                  */
2608                 if (error) {
2609                         nfs_rw_exit(&odrp->r_rwlock);
2610                         nfs_rw_exit(&ndrp->r_rwlock);
2611                         if (nvp) {
2612                                 VN_RELE(nvp);
2613                         }
2614                         return (error);
2615                 }
2616                 ASSERT(ovp != NULL);
2617         }
2618 
2619         dnlc_remove(odvp, onm);
2620         dnlc_remove(ndvp, nnm);
2621 
2622         setdiropargs(&args.rna_from, onm, odvp);
2623         setdiropargs(&args.rna_to, nnm, ndvp);
2624 
2625         douprintf = 1;
2626 
2627         error = rfs2call(VTOMI(odvp), RFS_RENAME,
2628             xdr_rnmargs, (caddr_t)&args,
2629             xdr_enum, (caddr_t)&status, cr,
2630             &douprintf, &status, 0, NULL);
2631 
2632         PURGE_ATTRCACHE(odvp);  /* mod time changed */
2633         PURGE_ATTRCACHE(ndvp);  /* mod time changed */
2634 
2635         if (!error) {
2636                 error = geterrno(status);
2637                 if (!error) {
2638                         if (HAVE_RDDIR_CACHE(odrp))
2639                                 nfs_purge_rddir_cache(odvp);
2640                         if (HAVE_RDDIR_CACHE(ndrp))
2641                                 nfs_purge_rddir_cache(ndvp);
2642                         /*
2643                          * When renaming directories to be a subdirectory of a
2644                          * different parent, the dnlc entry for ".." will no
2645                          * longer be valid, so it must be removed.
2646                          */
2647                         rp = VTOR(ovp);
2648                         if (ndvp != odvp) {
2649                                 if (ovp->v_type == VDIR) {
2650                                         dnlc_remove(ovp, "..");
2651                                         if (HAVE_RDDIR_CACHE(rp))
2652                                                 nfs_purge_rddir_cache(ovp);
2653                                 }
2654                         }
2655 
2656                         /*
2657                          * If we are renaming the unlinked file, update the
2658                          * r_unldvp and r_unlname as needed.
2659                          */
2660                         mutex_enter(&rp->r_statelock);
2661                         if (rp->r_unldvp != NULL) {
2662                                 if (strcmp(rp->r_unlname, onm) == 0) {
2663                                         (void) strncpy(rp->r_unlname,
2664                                             nnm, MAXNAMELEN);
2665                                         rp->r_unlname[MAXNAMELEN - 1] = '\0';
2666 
2667                                         if (ndvp != rp->r_unldvp) {
2668                                                 VN_RELE(rp->r_unldvp);
2669                                                 rp->r_unldvp = ndvp;
2670                                                 VN_HOLD(ndvp);
2671                                         }
2672                                 }
2673                         }
2674                         mutex_exit(&rp->r_statelock);
2675                 } else {
2676                         /*
                         * System V defines rename to return EEXIST, not
                         * ENOTEMPTY, if the target directory is not empty.
2679                          * Over the wire, the error is NFSERR_ENOTEMPTY
2680                          * which geterrno maps to ENOTEMPTY.
2681                          */
2682                         if (error == ENOTEMPTY)
2683                                 error = EEXIST;
2684                 }
2685         }
2686 
2687         if (error == 0) {
2688                 if (nvp)
2689                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
2690 
2691                 if (odvp != ndvp)
2692                         vnevent_rename_dest_dir(ndvp, ct);
2693 
2694                 ASSERT(ovp != NULL);
2695                 vnevent_rename_src(ovp, odvp, onm, ct);
2696         }
2697 
2698         if (nvp) {
2699                 VN_RELE(nvp);
2700         }
2701         VN_RELE(ovp);
2702 
2703         nfs_rw_exit(&odrp->r_rwlock);
2704         nfs_rw_exit(&ndrp->r_rwlock);
2705 
2706         return (error);
2707 }
2708 
2709 /* ARGSUSED */
2710 static int
2711 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2712         caller_context_t *ct, int flags, vsecattr_t *vsecp)
2713 {
2714         int error;
2715         struct nfscreatargs args;
2716         struct nfsdiropres dr;
2717         int douprintf;
2718         rnode_t *drp;
2719         hrtime_t t;
2720 
2721         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2722                 return (EPERM);
2723 
2724         setdiropargs(&args.ca_da, nm, dvp);
2725 
2726         /*
2727          * Decide what the group-id and set-gid bit of the created directory
2728          * should be.  May have to do a setattr to get the gid right.
2729          */
2730         error = setdirgid(dvp, &va->va_gid, cr);
2731         if (error)
2732                 return (error);
2733         error = setdirmode(dvp, &va->va_mode, cr);
2734         if (error)
2735                 return (error);
2736         va->va_mask |= AT_MODE|AT_GID;
2737 
2738         args.ca_sa = &args.ca_sa_buf;
2739         error = vattr_to_sattr(va, args.ca_sa);
2740         if (error) {
2741                 /* req time field(s) overflow - return immediately */
2742                 return (error);
2743         }
2744 
2745         drp = VTOR(dvp);
2746         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2747                 return (EINTR);
2748 
2749         dnlc_remove(dvp, nm);
2750 
2751         douprintf = 1;
2752 
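        /*
         * Snapshot the current time just before the over-the-wire
         * call so that the attributes returned for the new directory
         * can be cached relative to a time known to predate the
         * operation.
         */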
2753         t = gethrtime();
2754 
2755         error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2756             xdr_creatargs, (caddr_t)&args,
2757             xdr_diropres, (caddr_t)&dr, cr,
2758             &douprintf, &dr.dr_status, 0, NULL);
2759 
2760         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2761 
2762         if (!error) {
2763                 error = geterrno(dr.dr_status);
2764                 if (!error) {
2765                         if (HAVE_RDDIR_CACHE(drp))
2766                                 nfs_purge_rddir_cache(dvp);
2767                         /*
                         * The attributes returned by RFS_MKDIR cannot
2769                          * be depended upon, so mark the attribute cache
2770                          * as purged.  A subsequent GETATTR will get the
2771                          * correct attributes from the server.
2772                          */
2773                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2774                             dvp->v_vfsp, t, cr, NULL, NULL);
2775                         PURGE_ATTRCACHE(*vpp);
2776                         dnlc_update(dvp, nm, *vpp);
2777 
2778                         /*
2779                          * Make sure the gid was set correctly.
2780                          * If not, try to set it (but don't lose
2781                          * any sleep over it).
2782                          */
2783                         if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2784                                 va->va_mask = AT_GID;
2785                                 (void) nfssetattr(*vpp, va, 0, cr);
2786                         }
2787                 } else {
2788                         PURGE_STALE_FH(error, dvp, cr);
2789                 }
2790         }
2791 
2792         nfs_rw_exit(&drp->r_rwlock);
2793 
2794         return (error);
2795 }
2796 
2797 /* ARGSUSED */
2798 static int
2799 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2800         caller_context_t *ct, int flags)
2801 {
2802         int error;
2803         enum nfsstat status;
2804         struct nfsdiropargs da;
2805         vnode_t *vp;
2806         int douprintf;
2807         rnode_t *drp;
2808 
2809         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2810                 return (EPERM);
2811         drp = VTOR(dvp);
2812         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2813                 return (EINTR);
2814 
2815         /*
2816          * Attempt to prevent a rmdir(".") from succeeding.
2817          */
2818         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2819         if (error) {
2820                 nfs_rw_exit(&drp->r_rwlock);
2821                 return (error);
2822         }
2823 
2824         if (vp == cdir) {
2825                 VN_RELE(vp);
2826                 nfs_rw_exit(&drp->r_rwlock);
2827                 return (EINVAL);
2828         }
2829 
2830         setdiropargs(&da, nm, dvp);
2831 
2832         /*
2833          * First just remove the entry from the name cache, as it
2834          * is most likely an entry for this vp.
2835          */
2836         dnlc_remove(dvp, nm);
2837 
2838         /*
         * If the vnode reference count is greater than one, then
         * there may be additional references in the DNLC which will
         * need to be purged.  First, try removing the entry for
2842          * the parent directory and see if that removes the additional
2843          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2844          * to completely remove any references to the directory which
2845          * might still exist in the DNLC.
2846          */
2847         if (vp->v_count > 1) {
2848                 dnlc_remove(vp, "..");
2849                 if (vp->v_count > 1)
2850                         dnlc_purge_vp(vp);
2851         }
2852 
2853         douprintf = 1;
2854 
2855         error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2856             xdr_diropargs, (caddr_t)&da,
2857             xdr_enum, (caddr_t)&status, cr,
2858             &douprintf, &status, 0, NULL);
2859 
2860         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2861 
2862         if (error) {
2863                 VN_RELE(vp);
2864                 nfs_rw_exit(&drp->r_rwlock);
2865                 return (error);
2866         }
2867 
2868         error = geterrno(status);
2869         if (!error) {
2870                 if (HAVE_RDDIR_CACHE(drp))
2871                         nfs_purge_rddir_cache(dvp);
2872                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
2873                         nfs_purge_rddir_cache(vp);
2874         } else {
2875                 PURGE_STALE_FH(error, dvp, cr);
2876                 /*
                 * System V defines rmdir to return EEXIST, not
                 * ENOTEMPTY, if the directory is not empty.  Over
2879                  * the wire, the error is NFSERR_ENOTEMPTY which
2880                  * geterrno maps to ENOTEMPTY.
2881                  */
2882                 if (error == ENOTEMPTY)
2883                         error = EEXIST;
2884         }
2885 
2886         if (error == 0) {
2887                 vnevent_rmdir(vp, dvp, nm, ct);
2888         }
2889         VN_RELE(vp);
2890 
2891         nfs_rw_exit(&drp->r_rwlock);
2892 
2893         return (error);
2894 }
2895 
2896 /* ARGSUSED */
2897 static int
2898 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2899         caller_context_t *ct, int flags)
2900 {
2901         int error;
2902         struct nfsslargs args;
2903         enum nfsstat status;
2904         int douprintf;
2905         rnode_t *drp;
2906 
2907         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2908                 return (EPERM);
2909         setdiropargs(&args.sla_from, lnm, dvp);
2910         args.sla_sa = &args.sla_sa_buf;
2911         error = vattr_to_sattr(tva, args.sla_sa);
2912         if (error) {
2913                 /* req time field(s) overflow - return immediately */
2914                 return (error);
2915         }
2916         args.sla_tnm = tnm;
2917 
2918         drp = VTOR(dvp);
2919         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2920                 return (EINTR);
2921 
2922         dnlc_remove(dvp, lnm);
2923 
2924         douprintf = 1;
2925 
2926         error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2927             xdr_slargs, (caddr_t)&args,
2928             xdr_enum, (caddr_t)&status, cr,
2929             &douprintf, &status, 0, NULL);
2930 
2931         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2932 
2933         if (!error) {
2934                 error = geterrno(status);
2935                 if (!error) {
2936                         if (HAVE_RDDIR_CACHE(drp))
2937                                 nfs_purge_rddir_cache(dvp);
2938                 } else {
2939                         PURGE_STALE_FH(error, dvp, cr);
2940                 }
2941         }
2942 
2943         nfs_rw_exit(&drp->r_rwlock);
2944 
2945         return (error);
2946 }
2947 
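/*
 * Counters which record the behavior of the readdir cache.  These
 * are maintained only in DEBUG kernels.
 */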
2948 #ifdef DEBUG
2949 static int nfs_readdir_cache_hits = 0;
2950 static int nfs_readdir_cache_shorts = 0;
2951 static int nfs_readdir_cache_waits = 0;
2952 static int nfs_readdir_cache_misses = 0;
2953 static int nfs_readdir_readahead = 0;
2954 #endif
2955 
2956 static int nfs_shrinkreaddir = 0;
2957 
2958 /*
2959  * Read directory entries.
2960  * There are some weird things to look out for here.  The uio_offset
2961  * field is either 0 or it is the offset returned from a previous
2962  * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the maximum
 * number of bytes to read from the server.  This is advisory only;
 * the server may return only one block's worth of entries.  Entries
 * may be compressed on the server.
2967  */
2968 /* ARGSUSED */
2969 static int
2970 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2971         caller_context_t *ct, int flags)
2972 {
2973         int error;
2974         size_t count;
2975         rnode_t *rp;
2976         rddir_cache *rdc;
2977         rddir_cache *nrdc;
2978         rddir_cache *rrdc;
2979 #ifdef DEBUG
2980         int missed;
2981 #endif
2982         rddir_cache srdc;
2983         avl_index_t where;
2984 
2985         rp = VTOR(vp);
2986 
2987         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2988         if (nfs_zone() != VTOMI(vp)->mi_zone)
2989                 return (EIO);
2990         /*
2991          * Make sure that the directory cache is valid.
2992          */
2993         if (HAVE_RDDIR_CACHE(rp)) {
2994                 if (nfs_disable_rddir_cache) {
2995                         /*
2996                          * Setting nfs_disable_rddir_cache in /etc/system
2997                          * allows interoperability with servers that do not
2998                          * properly update the attributes of directories.
2999                          * Any cached information gets purged before an
3000                          * access is made to it.
3001                          */
3002                         nfs_purge_rddir_cache(vp);
3003                 } else {
3004                         error = nfs_validate_caches(vp, cr);
3005                         if (error)
3006                                 return (error);
3007                 }
3008         }
3009 
3010         /*
3011          * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
3012          * RFS_READDIR request with rda_count set to more than 0x400. So
3013          * we reduce the request size here purely for compatibility.
3014          *
3015          * In general, this is no longer required.  However, if a server
 * is discovered which cannot handle requests larger than 1024,
3017          * nfs_shrinkreaddir can be set to 1 to enable this backwards
3018          * compatibility.
3019          *
3020          * In any case, the request size is limited to NFS_MAXDATA bytes.
3021          */
3022         count = MIN(uiop->uio_iov->iov_len,
3023             nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3024 
3025         nrdc = NULL;
3026 #ifdef DEBUG
3027         missed = 0;
3028 #endif
3029 top:
3030         /*
         * Short-circuit the last readdir, which always returns 0 bytes.
3032          * This can be done after the directory has been read through
3033          * completely at least once.  This will set r_direof which
3034          * can be used to find the value of the last cookie.
3035          */
3036         mutex_enter(&rp->r_statelock);
3037         if (rp->r_direof != NULL &&
3038             uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3039                 mutex_exit(&rp->r_statelock);
3040 #ifdef DEBUG
3041                 nfs_readdir_cache_shorts++;
3042 #endif
3043                 if (eofp)
3044                         *eofp = 1;
3045                 if (nrdc != NULL)
3046                         rddir_cache_rele(nrdc);
3047                 return (0);
3048         }
3049         /*
3050          * Look for a cache entry.  Cache entries are identified
3051          * by the NFS cookie value and the byte count requested.
3052          */
3053         srdc.nfs_cookie = uiop->uio_offset;
3054         srdc.buflen = count;
3055         rdc = avl_find(&rp->r_dir, &srdc, &where);
3056         if (rdc != NULL) {
3057                 rddir_cache_hold(rdc);
3058                 /*
3059                  * If the cache entry is in the process of being
3060                  * filled in, wait until this completes.  The
3061                  * RDDIRWAIT bit is set to indicate that someone
                 * is waiting, and when the thread currently
                 * filling the entry is done, it should do a
                 * cv_broadcast to wake up all of the threads
3065                  * waiting for it to finish.
3066                  */
3067                 if (rdc->flags & RDDIR) {
3068                         nfs_rw_exit(&rp->r_rwlock);
3069                         rdc->flags |= RDDIRWAIT;
3070 #ifdef DEBUG
3071                         nfs_readdir_cache_waits++;
3072 #endif
3073                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3074                                 /*
3075                                  * We got interrupted, probably
3076                                  * the user typed ^C or an alarm
3077                                  * fired.  We free the new entry
3078                                  * if we allocated one.
3079                                  */
3080                                 mutex_exit(&rp->r_statelock);
3081                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3082                                     RW_READER, FALSE);
3083                                 rddir_cache_rele(rdc);
3084                                 if (nrdc != NULL)
3085                                         rddir_cache_rele(nrdc);
3086                                 return (EINTR);
3087                         }
3088                         mutex_exit(&rp->r_statelock);
3089                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3090                             RW_READER, FALSE);
3091                         rddir_cache_rele(rdc);
3092                         goto top;
3093                 }
3094                 /*
3095                  * Check to see if a readdir is required to
3096                  * fill the entry.  If so, mark this entry
3097                  * as being filled, remove our reference,
3098                  * and branch to the code to fill the entry.
3099                  */
3100                 if (rdc->flags & RDDIRREQ) {
3101                         rdc->flags &= ~RDDIRREQ;
3102                         rdc->flags |= RDDIR;
3103                         if (nrdc != NULL)
3104                                 rddir_cache_rele(nrdc);
3105                         nrdc = rdc;
3106                         mutex_exit(&rp->r_statelock);
3107                         goto bottom;
3108                 }
3109 #ifdef DEBUG
3110                 if (!missed)
3111                         nfs_readdir_cache_hits++;
3112 #endif
3113                 /*
3114                  * If an error occurred while attempting
3115                  * to fill the cache entry, just return it.
3116                  */
3117                 if (rdc->error) {
3118                         error = rdc->error;
3119                         mutex_exit(&rp->r_statelock);
3120                         rddir_cache_rele(rdc);
3121                         if (nrdc != NULL)
3122                                 rddir_cache_rele(nrdc);
3123                         return (error);
3124                 }
3125 
3126                 /*
3127                  * The cache entry is complete and good,
3128                  * copyout the dirent structs to the calling
3129                  * thread.
3130                  */
3131                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3132 
3133                 /*
3134                  * If no error occurred during the copyout,
3135                  * update the offset in the uio struct to
3136                  * contain the value of the next cookie
3137                  * and set the eof value appropriately.
3138                  */
3139                 if (!error) {
3140                         uiop->uio_offset = rdc->nfs_ncookie;
3141                         if (eofp)
3142                                 *eofp = rdc->eof;
3143                 }
3144 
3145                 /*
                 * Decide whether to do readahead.  Don't if we
                 * have already read to the end of the directory.
3148                  */
3149                 if (rdc->eof) {
3150                         rp->r_direof = rdc;
3151                         mutex_exit(&rp->r_statelock);
3152                         rddir_cache_rele(rdc);
3153                         if (nrdc != NULL)
3154                                 rddir_cache_rele(nrdc);
3155                         return (error);
3156                 }
3157 
3158                 /*
3159                  * Check to see whether we found an entry
3160                  * for the readahead.  If so, we don't need
3161                  * to do anything further, so free the new
3162                  * entry if one was allocated.  Otherwise,
3163                  * allocate a new entry, add it to the cache,
3164                  * and then initiate an asynchronous readdir
3165                  * operation to fill it.
3166                  */
3167                 srdc.nfs_cookie = rdc->nfs_ncookie;
3168                 srdc.buflen = count;
3169                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3170                 if (rrdc != NULL) {
3171                         if (nrdc != NULL)
3172                                 rddir_cache_rele(nrdc);
3173                 } else {
3174                         if (nrdc != NULL)
3175                                 rrdc = nrdc;
3176                         else {
3177                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3178                         }
3179                         if (rrdc != NULL) {
3180                                 rrdc->nfs_cookie = rdc->nfs_ncookie;
3181                                 rrdc->buflen = count;
3182                                 avl_insert(&rp->r_dir, rrdc, where);
3183                                 rddir_cache_hold(rrdc);
3184                                 mutex_exit(&rp->r_statelock);
3185                                 rddir_cache_rele(rdc);
3186 #ifdef DEBUG
3187                                 nfs_readdir_readahead++;
3188 #endif
3189                                 nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3190                                 return (error);
3191                         }
3192                 }
3193 
3194                 mutex_exit(&rp->r_statelock);
3195                 rddir_cache_rele(rdc);
3196                 return (error);
3197         }
3198 
3199         /*
3200          * Didn't find an entry in the cache.  Construct a new empty
3201          * entry and link it into the cache.  Other processes attempting
3202          * to access this entry will need to wait until it is filled in.
3203          *
3204          * Since kmem_alloc may block, another pass through the cache
3205          * will need to be taken to make sure that another process
3206          * hasn't already added an entry to the cache for this request.
3207          */
3208         if (nrdc == NULL) {
3209                 mutex_exit(&rp->r_statelock);
3210                 nrdc = rddir_cache_alloc(KM_SLEEP);
3211                 nrdc->nfs_cookie = uiop->uio_offset;
3212                 nrdc->buflen = count;
3213                 goto top;
3214         }
3215 
3216         /*
3217          * Add this entry to the cache.
3218          */
3219         avl_insert(&rp->r_dir, nrdc, where);
3220         rddir_cache_hold(nrdc);
3221         mutex_exit(&rp->r_statelock);
3222 
3223 bottom:
3224 #ifdef DEBUG
3225         missed = 1;
3226         nfs_readdir_cache_misses++;
3227 #endif
3228         /*
3229          * Do the readdir.
3230          */
3231         error = nfsreaddir(vp, nrdc, cr);
3232 
3233         /*
3234          * If this operation failed, just return the error which occurred.
3235          */
3236         if (error != 0)
3237                 return (error);
3238 
3239         /*
         * Since the RPC operation will have taken some time and blocked
3241          * this process, another pass through the cache will need to be
3242          * taken to find the correct cache entry.  It is possible that
3243          * the correct cache entry will not be there (although one was
3244          * added) because the directory changed during the RPC operation
3245          * and the readdir cache was flushed.  In this case, just start
3246          * over.  It is hoped that this will not happen too often... :-)
3247          */
3248         nrdc = NULL;
3249         goto top;
3250         /* NOTREACHED */
3251 }
3252 
3253 static int
3254 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3255 {
3256         int error;
3257         struct nfsrddirargs rda;
3258         struct nfsrddirres rd;
3259         rnode_t *rp;
3260         mntinfo_t *mi;
3261         uint_t count;
3262         int douprintf;
3263         failinfo_t fi, *fip;
3264 
3265         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3266         count = rdc->buflen;
3267 
3268         rp = VTOR(vp);
3269         mi = VTOMI(vp);
3270 
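        /*
         * Set up the over-the-wire arguments using the file handle
         * from the rnode and the cookie identifying where in the
         * directory to resume reading.
         */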
3271         rda.rda_fh = *VTOFH(vp);
3272         rda.rda_offset = rdc->nfs_cookie;
3273 
3274         /*
3275          * NFS client failover support
3276          * suppress failover unless we have a zero cookie
3277          */
3278         if (rdc->nfs_cookie == (off_t)0) {
3279                 fi.vp = vp;
3280                 fi.fhp = (caddr_t)&rda.rda_fh;
3281                 fi.copyproc = nfscopyfh;
3282                 fi.lookupproc = nfslookup;
3283                 fi.xattrdirproc = acl_getxattrdir2;
3284                 fip = &fi;
3285         } else {
3286                 fip = NULL;
3287         }
3288 
3289         rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3290         rd.rd_size = count;
3291         rd.rd_offset = rda.rda_offset;
3292 
3293         douprintf = 1;
3294 
3295         if (mi->mi_io_kstats) {
3296                 mutex_enter(&mi->mi_lock);
3297                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3298                 mutex_exit(&mi->mi_lock);
3299         }
3300 
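        /*
         * Issue the READDIR over the wire, limiting each request to
         * the current read transfer size and redriving the request
         * as long as failover processing returns ENFS_TRYAGAIN.
         */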
3301         do {
3302                 rda.rda_count = MIN(count, mi->mi_curread);
3303                 error = rfs2call(mi, RFS_READDIR,
3304                     xdr_rddirargs, (caddr_t)&rda,
3305                     xdr_getrddirres, (caddr_t)&rd, cr,
3306                     &douprintf, &rd.rd_status, 0, fip);
3307         } while (error == ENFS_TRYAGAIN);
3308 
3309         if (mi->mi_io_kstats) {
3310                 mutex_enter(&mi->mi_lock);
3311                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3312                 mutex_exit(&mi->mi_lock);
3313         }
3314 
3315         /*
3316          * Since we are actually doing a READDIR RPC, we must have
3317          * exclusive access to the cache entry being filled.  Thus,
3318          * it is safe to update all fields except for the flags
3319          * field.  The r_statelock in the rnode must be held to
3320          * prevent two different threads from simultaneously
3321          * attempting to update the flags field.  This can happen
3322          * if we are turning off RDDIR and the other thread is
3323          * trying to set RDDIRWAIT.
3324          */
3325         ASSERT(rdc->flags & RDDIR);
3326         if (!error) {
3327                 error = geterrno(rd.rd_status);
3328                 if (!error) {
3329                         rdc->nfs_ncookie = rd.rd_offset;
3330                         rdc->eof = rd.rd_eof ? 1 : 0;
3331                         rdc->entlen = rd.rd_size;
3332                         ASSERT(rdc->entlen <= rdc->buflen);
3333 #ifdef DEBUG
3334                         rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3335                             KM_SLEEP);
3336 #else
3337                         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3338 #endif
3339                         bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3340                         rdc->error = 0;
3341                         if (mi->mi_io_kstats) {
3342                                 mutex_enter(&mi->mi_lock);
3343                                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3344                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3345                                     rd.rd_size;
3346                                 mutex_exit(&mi->mi_lock);
3347                         }
3348                 } else {
3349                         PURGE_STALE_FH(error, vp, cr);
3350                 }
3351         }
3352         if (error) {
3353                 rdc->entries = NULL;
3354                 rdc->error = error;
3355         }
3356         kmem_free(rd.rd_entries, rdc->buflen);
3357 
3358         mutex_enter(&rp->r_statelock);
3359         rdc->flags &= ~RDDIR;
3360         if (rdc->flags & RDDIRWAIT) {
3361                 rdc->flags &= ~RDDIRWAIT;
3362                 cv_broadcast(&rdc->cv);
3363         }
3364         if (error)
3365                 rdc->flags |= RDDIRREQ;
3366         mutex_exit(&rp->r_statelock);
3367 
3368         rddir_cache_rele(rdc);
3369 
3370         return (error);
3371 }
3372 
3373 #ifdef DEBUG
3374 static int nfs_bio_do_stop = 0;
3375 #endif
3376 
3377 static int
3378 nfs_bio(struct buf *bp, cred_t *cr)
3379 {
3380         rnode_t *rp = VTOR(bp->b_vp);
3381         int count;
3382         int error;
3383         cred_t *cred;
3384         uint_t offset;
3385 
3386         DTRACE_IO1(start, struct buf *, bp);
3387 
3388         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
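        /* Convert the buffer's DEV_BSIZE block number to a byte offset. */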
3389         offset = dbtob(bp->b_blkno);
3390 
3391         if (bp->b_flags & B_READ) {
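                /*
                 * Use the credentials cached in the rnode if they
                 * exist; otherwise cache the caller's credentials
                 * there for later use.
                 */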
3392                 mutex_enter(&rp->r_statelock);
3393                 if (rp->r_cred != NULL) {
3394                         cred = rp->r_cred;
3395                         crhold(cred);
3396                 } else {
3397                         rp->r_cred = cr;
3398                         crhold(cr);
3399                         cred = cr;
3400                         crhold(cred);
3401                 }
3402                 mutex_exit(&rp->r_statelock);
3403         read_again:
3404                 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3405                     offset, bp->b_bcount, &bp->b_resid, cred);
3406 
3407                 crfree(cred);
3408                 if (!error) {
3409                         if (bp->b_resid) {
3410                                 /*
3411                                  * Didn't get it all because we hit EOF,
3412                                  * zero all the memory beyond the EOF.
3413                                  */
3415                                 bzero(bp->b_un.b_addr +
3416                                     bp->b_bcount - bp->b_resid, bp->b_resid);
3417                         }
3418                         mutex_enter(&rp->r_statelock);
3419                         if (bp->b_resid == bp->b_bcount &&
3420                             offset >= rp->r_size) {
3421                                 /*
3422                                  * We didn't read anything at all as we are
3423                                  * past EOF.  Return an error indicator back
3424                                  * but don't destroy the pages (yet).
3425                                  */
3426                                 error = NFS_EOF;
3427                         }
3428                         mutex_exit(&rp->r_statelock);
3429                 } else if (error == EACCES) {
3430                         mutex_enter(&rp->r_statelock);
3431                         if (cred != cr) {
3432                                 if (rp->r_cred != NULL)
3433                                         crfree(rp->r_cred);
3434                                 rp->r_cred = cr;
3435                                 crhold(cr);
3436                                 cred = cr;
3437                                 crhold(cred);
3438                                 mutex_exit(&rp->r_statelock);
3439                                 goto read_again;
3440                         }
3441                         mutex_exit(&rp->r_statelock);
3442                 }
3443         } else {
3444                 if (!(rp->r_flags & RSTALE)) {
3445                         mutex_enter(&rp->r_statelock);
3446                         if (rp->r_cred != NULL) {
3447                                 cred = rp->r_cred;
3448                                 crhold(cred);
3449                         } else {
3450                                 rp->r_cred = cr;
3451                                 crhold(cr);
3452                                 cred = cr;
3453                                 crhold(cred);
3454                         }
3455                         mutex_exit(&rp->r_statelock);
3456                 write_again:
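                        /*
                         * Clamp the write to the current file size.
                         * Since i/o is done in whole pages, b_bcount
                         * may extend beyond the end of the file.
                         */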
3457                         mutex_enter(&rp->r_statelock);
3458                         count = MIN(bp->b_bcount, rp->r_size - offset);
3459                         mutex_exit(&rp->r_statelock);
3460                         if (count < 0)
3461                                 cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3462 #ifdef DEBUG
3463                         if (count == 0) {
3464                                 zcmn_err(getzoneid(), CE_WARN,
                                    "nfs_bio: zero length write at %u",
3466                                     offset);
3467                                 nfs_printfhandle(&rp->r_fh);
3468                                 if (nfs_bio_do_stop)
3469                                         debug_enter("nfs_bio");
3470                         }
3471 #endif
3472                         error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3473                             count, cred);
3474                         if (error == EACCES) {
3475                                 mutex_enter(&rp->r_statelock);
3476                                 if (cred != cr) {
3477                                         if (rp->r_cred != NULL)
3478                                                 crfree(rp->r_cred);
3479                                         rp->r_cred = cr;
3480                                         crhold(cr);
3481                                         crfree(cred);
3482                                         cred = cr;
3483                                         crhold(cred);
3484                                         mutex_exit(&rp->r_statelock);
3485                                         goto write_again;
3486                                 }
3487                                 mutex_exit(&rp->r_statelock);
3488                         }
3489                         bp->b_error = error;
3490                         if (error && error != EINTR) {
3491                                 /*
3492                                  * Don't print EDQUOT errors on the console.
3493                                  * Don't print asynchronous EACCES errors.
3494                                  * Don't print EFBIG errors.
3495                                  * Print all other write errors.
3496                                  */
3497                                 if (error != EDQUOT && error != EFBIG &&
3498                                     (error != EACCES ||
3499                                     !(bp->b_flags & B_ASYNC)))
3500                                         nfs_write_error(bp->b_vp, error, cred);
3501                                 /*
3502                                  * Update r_error and r_flags as appropriate.
3503                                  * If the error was ESTALE, then mark the
3504                                  * rnode as not being writeable and save
3505                                  * the error status.  Otherwise, save any
3506                                  * errors which occur from asynchronous
3507                                  * page invalidations.  Any errors occurring
3508                                  * from other operations should be saved
3509                                  * by the caller.
3510                                  */
3511                                 mutex_enter(&rp->r_statelock);
3512                                 if (error == ESTALE) {
3513                                         rp->r_flags |= RSTALE;
3514                                         if (!rp->r_error)
3515                                                 rp->r_error = error;
3516                                 } else if (!rp->r_error &&
3517                                     (bp->b_flags &
3518                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
3519                                     (B_INVAL|B_FORCE|B_ASYNC)) {
3520                                         rp->r_error = error;
3521                                 }
3522                                 mutex_exit(&rp->r_statelock);
3523                         }
3524                         crfree(cred);
3525                 } else {
3526                         error = rp->r_error;
                        /*
                         * A close may have cleared r_error; if so,
                         * propagate the ESTALE error return properly.
                         */
3531                         if (error == 0)
3532                                 error = ESTALE;
3533                 }
3534         }
3535 
3536         if (error != 0 && error != NFS_EOF)
3537                 bp->b_flags |= B_ERROR;
3538 
3539         DTRACE_IO1(done, struct buf *, bp);
3540 
3541         return (error);
3542 }
3543 
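/*
 * Construct an NFS style file identifier from the file handle cached
 * in the rnode.  If the caller's fid buffer is too small to hold it,
 * set fid_len to the required length and return ENOSPC.
 */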
3544 /* ARGSUSED */
3545 static int
3546 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3547 {
3548         struct nfs_fid *fp;
3549         rnode_t *rp;
3550 
3551         rp = VTOR(vp);
3552 
3553         if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3554                 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3555                 return (ENOSPC);
3556         }
3557         fp = (struct nfs_fid *)fidp;
3558         fp->nf_pad = 0;
3559         fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3560         bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3561         return (0);
3562 }
3563 
3564 /* ARGSUSED2 */
3565 static int
3566 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3567 {
3568         rnode_t *rp = VTOR(vp);
3569 
3570         if (!write_lock) {
3571                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3572                 return (V_WRITELOCK_FALSE);
3573         }
3574 
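        /*
         * If the file is marked for direct i/o, a reader lock is
         * sufficient as long as the file is neither mapped nor has
         * cached pages; otherwise, fall through and take the writer
         * lock.
         */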
3575         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3576                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3577                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3578                         return (V_WRITELOCK_FALSE);
3579                 nfs_rw_exit(&rp->r_rwlock);
3580         }
3581 
3582         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3583         return (V_WRITELOCK_TRUE);
3584 }
3585 
3586 /* ARGSUSED */
3587 static void
3588 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3589 {
3590         rnode_t *rp = VTOR(vp);
3591 
3592         nfs_rw_exit(&rp->r_rwlock);
3593 }
3594 
3595 /* ARGSUSED */
3596 static int
3597 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3598 {
3599 
3600         /*
         * Because we stuff the readdir cookie into the offset field,
         * someone may attempt to do an lseek with the cookie, which
         * we want to succeed.
3604          */
3605         if (vp->v_type == VDIR)
3606                 return (0);
3607         if (*noffp < 0 || *noffp > MAXOFF32_T)
3608                 return (EINVAL);
3609         return (0);
3610 }
3611 
3612 /*
3613  * number of NFS_MAXDATA blocks to read ahead
3614  * optimized for 100 base-T.
3615  */
3616 static int nfs_nra = 4;
3617 
3618 #ifdef DEBUG
3619 static int nfs_lostpage = 0;    /* number of times we lost original page */
3620 #endif
3621 
3622 /*
3623  * Return all the pages from [off..off+len) in file
3624  */
3625 /* ARGSUSED */
3626 static int
3627 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3628         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3629         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3630 {
3631         rnode_t *rp;
3632         int error;
3633         mntinfo_t *mi;
3634 
3635         if (vp->v_flag & VNOMAP)
3636                 return (ENOSYS);
3637 
3638         ASSERT(off <= MAXOFF32_T);
3639         if (nfs_zone() != VTOMI(vp)->mi_zone)
3640                 return (EIO);
3641         if (protp != NULL)
3642                 *protp = PROT_ALL;
3643 
3644         /*
         * Now validate that the caches are up to date.
3646          */
3647         error = nfs_validate_caches(vp, cr);
3648         if (error)
3649                 return (error);
3650 
3651         rp = VTOR(vp);
3652         mi = VTOMI(vp);
3653 retry:
3654         mutex_enter(&rp->r_statelock);
3655 
3656         /*
3657          * Don't create dirty pages faster than they
3658          * can be cleaned so that the system doesn't
3659          * get imbalanced.  If the async queue is
3660          * maxed out, then wait for it to drain before
3661          * creating more dirty pages.  Also, wait for
3662          * any threads doing pagewalks in the vop_getattr
3663          * entry points so that they don't block for
3664          * long periods.
3665          */
3666         if (rw == S_CREATE) {
3667                 while ((mi->mi_max_threads != 0 &&
3668                     rp->r_awcount > 2 * mi->mi_max_threads) ||
3669                     rp->r_gcount > 0)
3670                         cv_wait(&rp->r_cv, &rp->r_statelock);
3671         }
3672 
3673         /*
3674          * If we are getting called as a side effect of an nfs_write()
         * operation, the local file size might not be extended yet.
3676          * In this case we want to be able to return pages of zeroes.
3677          */
3678         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3679                 mutex_exit(&rp->r_statelock);
3680                 return (EFAULT);                /* beyond EOF */
3681         }
3682 
3683         mutex_exit(&rp->r_statelock);
3684 
3685         error = pvn_getpages(nfs_getapage, vp, off, len, protp, pl, plsz,
3686             seg, addr, rw, cr);
3687 
3688         switch (error) {
3689         case NFS_EOF:
3690                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3691                 goto retry;
3692         case ESTALE:
3693                 PURGE_STALE_FH(error, vp, cr);
3694         }
3695 
3696         return (error);
3697 }
3698 
3699 /*
3700  * Called from pvn_getpages to get a particular page.
3701  */
3702 /* ARGSUSED */
3703 static int
3704 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3705         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3706         enum seg_rw rw, cred_t *cr)
3707 {
3708         rnode_t *rp;
3709         uint_t bsize;
3710         struct buf *bp;
3711         page_t *pp;
3712         u_offset_t lbn;
3713         u_offset_t io_off;
3714         u_offset_t blkoff;
3715         u_offset_t rablkoff;
3716         size_t io_len;
3717         uint_t blksize;
3718         int error;
3719         int readahead;
3720         int readahead_issued = 0;
3721         int ra_window; /* readahead window */
3722         page_t *pagefound;
3723 
3724         if (nfs_zone() != VTOMI(vp)->mi_zone)
3725                 return (EIO);
3726         rp = VTOR(vp);
3727         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3728 
3729 reread:
3730         bp = NULL;
3731         pp = NULL;
3732         pagefound = NULL;
3733 
3734         if (pl != NULL)
3735                 pl[0] = NULL;
3736 
3737         error = 0;
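        /* Compute the logical block number and the offset of its start. */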
3738         lbn = off / bsize;
3739         blkoff = lbn * bsize;
3740 
3741         /*
3742          * Queueing up the readahead before doing the synchronous read
3743          * results in a significant increase in read throughput because
3744          * of the increased parallelism between the async threads and
3745          * the process context.
3746          */
3747         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3748             rw != S_CREATE &&
3749             !(vp->v_flag & VNOCACHE)) {
3750                 mutex_enter(&rp->r_statelock);
3751 
3752                 /*
3753                  * Calculate the number of readaheads to do.
3754                  * a) No readaheads at offset = 0.
3755                  * b) Do maximum(nfs_nra) readaheads when the readahead
3756                  *    window is closed.
                 * c) Do between 1 and (nfs_nra - 1) readaheads, depending
                 *    upon how far the readahead window is open or closed.
3759                  * d) No readaheads if rp->r_nextr is not within the scope
3760                  *    of the readahead window (random i/o).
3761                  */
3762 
3763                 if (off == 0)
3764                         readahead = 0;
3765                 else if (blkoff == rp->r_nextr)
3766                         readahead = nfs_nra;
3767                 else if (rp->r_nextr > blkoff &&
3768                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
3769                     <= (nfs_nra - 1)))
3770                         readahead = nfs_nra - ra_window;
3771                 else
3772                         readahead = 0;
3773 
3774                 rablkoff = rp->r_nextr;
3775                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3776                         mutex_exit(&rp->r_statelock);
3777                         if (nfs_async_readahead(vp, rablkoff + bsize,
3778                             addr + (rablkoff + bsize - off), seg, cr,
3779                             nfs_readahead) < 0) {
3780                                 mutex_enter(&rp->r_statelock);
3781                                 break;
3782                         }
3783                         readahead--;
3784                         rablkoff += bsize;
3785                         /*
3786                          * Indicate that we did a readahead so
3787                          * readahead offset is not updated
3788                          * by the synchronous read below.
3789                          */
3790                         readahead_issued = 1;
3791                         mutex_enter(&rp->r_statelock);
3792                         /*
3793                          * set readahead offset to
3794                          * offset of last async readahead
3795                          * request.
3796                          */
3797                         rp->r_nextr = rablkoff;
3798                 }
3799                 mutex_exit(&rp->r_statelock);
3800         }
3801 
3802 again:
3803         if ((pagefound = page_exists(vp, off)) == NULL) {
3804                 if (pl == NULL) {
3805                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3806                             nfs_readahead);
3807                 } else if (rw == S_CREATE) {
3808                         /*
3809                          * Block for this page is not allocated, or the offset
3810                          * is beyond the current allocation size, or we're
3811                          * allocating a swap slot and the page was not found,
3812                          * so allocate it and return a zero page.
3813                          */
3814                         if ((pp = page_create_va(vp, off,
3815                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3816                                 cmn_err(CE_PANIC, "nfs_getapage: page_create");
3817                         io_len = PAGESIZE;
3818                         mutex_enter(&rp->r_statelock);
3819                         rp->r_nextr = off + PAGESIZE;
3820                         mutex_exit(&rp->r_statelock);
3821                 } else {
3822                         /*
3823                          * Need to go to server to get a BLOCK, exception to
3824                          * that being while reading at offset = 0 or doing
3825                          * random i/o, in that case read only a PAGE.
3826                          */
3827                         mutex_enter(&rp->r_statelock);
3828                         if (blkoff < rp->r_size &&
3829                             blkoff + bsize >= rp->r_size) {
3830                                 /*
3831                                  * If only a block or less is left in
3832                                  * the file, read all that is remaining.
3833                                  */
3834                                 if (rp->r_size <= off) {
3835                                         /*
3836                                          * Trying to access beyond EOF,
3837                                          * set up to get at least one page.
3838                                          */
3839                                         blksize = off + PAGESIZE - blkoff;
3840                                 } else
3841                                         blksize = rp->r_size - blkoff;
3842                         } else if ((off == 0) ||
3843                             (off != rp->r_nextr && !readahead_issued)) {
3844                                 blksize = PAGESIZE;
3845                                 blkoff = off; /* block = page here */
3846                         } else
3847                                 blksize = bsize;
3848                         mutex_exit(&rp->r_statelock);
3849 
3850                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3851                             &io_len, blkoff, blksize, 0);
3852 
3853                         /*
3854                          * Some other thread has entered the page,
3855                          * so just use it.
3856                          */
3857                         if (pp == NULL)
3858                                 goto again;
3859 
3860                         /*
3861                          * Now round the request size up to page boundaries.
3862                          * This ensures that the entire page will be
3863                          * initialized to zeroes if EOF is encountered.
3864                          */
3865                         io_len = ptob(btopr(io_len));
3866 
3867                         bp = pageio_setup(pp, io_len, vp, B_READ);
3868                         ASSERT(bp != NULL);
3869 
3870                         /*
3871                          * pageio_setup should have set b_addr to 0.  This
3872                          * is correct since we want to do I/O on a page
3873                          * boundary.  bp_mapin will use this addr to calculate
3874                          * an offset, and then set b_addr to the kernel virtual
3875                          * address it allocated for us.
3876                          */
3877                         ASSERT(bp->b_un.b_addr == 0);
3878 
3879                         bp->b_edev = 0;
3880                         bp->b_dev = 0;
3881                         bp->b_lblkno = lbtodb(io_off);
3882                         bp->b_file = vp;
3883                         bp->b_offset = (offset_t)off;
3884                         bp_mapin(bp);
3885 
3886                         /*
3887                          * If doing a write beyond what we believe is EOF,
3888                          * don't bother trying to read the pages from the
3889                          * server, we'll just zero the pages here.  We
3890                          * don't check that the rw flag is S_WRITE here
3891                          * because some implementations may attempt a
3892                          * read access to the buffer before copying data.
3893                          */
3894                         mutex_enter(&rp->r_statelock);
3895                         if (io_off >= rp->r_size && seg == segkmap) {
3896                                 mutex_exit(&rp->r_statelock);
3897                                 bzero(bp->b_un.b_addr, io_len);
3898                         } else {
3899                                 mutex_exit(&rp->r_statelock);
3900                                 error = nfs_bio(bp, cr);
3901                         }
3902 
3903                         /*
3904                          * Unmap the buffer before freeing it.
3905                          */
3906                         bp_mapout(bp);
3907                         pageio_done(bp);
3908 
3909                         if (error == NFS_EOF) {
3910                                 /*
3911                                  * If doing a write system call just return
3912                                  * zeroed pages, else user tried to get pages
3913                                  * beyond EOF, return error.  We don't check
3914                                  * that the rw flag is S_WRITE here because
3915                                  * some implementations may attempt a read
3916                                  * access to the buffer before copying data.
3917                                  */
3918                                 if (seg == segkmap)
3919                                         error = 0;
3920                                 else
3921                                         error = EFAULT;
3922                         }
3923 
3924                         if (!readahead_issued && !error) {
3925                                 mutex_enter(&rp->r_statelock);
3926                                 rp->r_nextr = io_off + io_len;
3927                                 mutex_exit(&rp->r_statelock);
3928                         }
3929                 }
3930         }
3931 
3932 out:
3933         if (pl == NULL)
3934                 return (error);
3935 
3936         if (error) {
3937                 if (pp != NULL)
3938                         pvn_read_done(pp, B_ERROR);
3939                 return (error);
3940         }
3941 
3942         if (pagefound) {
3943                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3944 
3945                 /*
3946                  * Page exists in the cache, acquire the appropriate lock.
3947                  * If this fails, start all over again.
3948                  */
3949                 if ((pp = page_lookup(vp, off, se)) == NULL) {
3950 #ifdef DEBUG
3951                         nfs_lostpage++;
3952 #endif
3953                         goto reread;
3954                 }
3955                 pl[0] = pp;
3956                 pl[1] = NULL;
3957                 return (0);
3958         }
3959 
3960         if (pp != NULL)
3961                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3962 
3963         return (error);
3964 }
3965 
3966 static void
3967 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3968         cred_t *cr)
3969 {
3970         int error;
3971         page_t *pp;
3972         u_offset_t io_off;
3973         size_t io_len;
3974         struct buf *bp;
3975         uint_t bsize, blksize;
3976         rnode_t *rp = VTOR(vp);
3977 
3978         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3979 
3980         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3981 
3982         mutex_enter(&rp->r_statelock);
3983         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3984                 /*
                 * If less than a block is left in the file, read
                 * less than a block.
3987                  */
3988                 blksize = rp->r_size - blkoff;
3989         } else
3990                 blksize = bsize;
3991         mutex_exit(&rp->r_statelock);
3992 
3993         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3994             &io_off, &io_len, blkoff, blksize, 1);
3995         /*
 * Since the isra flag passed to the kluster function is 1, we may have
 * gotten a NULL return value for a variety of reasons (# of free
 * pages < minfree, someone entered the page on the vnode, etc.).  In
 * all cases, we want to punt on the readahead.
4000          */
4001         if (pp == NULL)
4002                 return;
4003 
4004         /*
4005          * Now round the request size up to page boundaries.
4006          * This ensures that the entire page will be
4007          * initialized to zeroes if EOF is encountered.
4008          */
4009         io_len = ptob(btopr(io_len));
4010 
4011         bp = pageio_setup(pp, io_len, vp, B_READ);
4012         ASSERT(bp != NULL);
4013 
4014         /*
4015          * pageio_setup should have set b_addr to 0.  This is correct since
4016          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4017          * to calculate an offset, and then set b_addr to the kernel virtual
4018          * address it allocated for us.
4019          */
4020         ASSERT(bp->b_un.b_addr == 0);
4021 
4022         bp->b_edev = 0;
4023         bp->b_dev = 0;
4024         bp->b_lblkno = lbtodb(io_off);
4025         bp->b_file = vp;
4026         bp->b_offset = (offset_t)blkoff;
4027         bp_mapin(bp);
4028 
4029         /*
4030          * If doing a write beyond what we believe is EOF, don't bother trying
4031          * to read the pages from the server, we'll just zero the pages here.
4032          * We don't check that the rw flag is S_WRITE here because some
4033          * implementations may attempt a read access to the buffer before
4034          * copying data.
4035          */
4036         mutex_enter(&rp->r_statelock);
4037         if (io_off >= rp->r_size && seg == segkmap) {
4038                 mutex_exit(&rp->r_statelock);
4039                 bzero(bp->b_un.b_addr, io_len);
4040                 error = 0;
4041         } else {
4042                 mutex_exit(&rp->r_statelock);
4043                 error = nfs_bio(bp, cr);
4044                 if (error == NFS_EOF)
4045                         error = 0;
4046         }
4047 
4048         /*
4049          * Unmap the buffer before freeing it.
4050          */
4051         bp_mapout(bp);
4052         pageio_done(bp);
4053 
4054         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4055 
4056         /*
4057          * In case of error, set the next readahead offset (r_nextr)
4058          * back to the lowest offset of this request.  pvn_read_done()
4059          * has already called VN_DISPOSE to destroy the pages.
4060          */
4061         if (error && rp->r_nextr > io_off) {
4062                 mutex_enter(&rp->r_statelock);
4063                 if (rp->r_nextr > io_off)
4064                         rp->r_nextr = io_off;
4065                 mutex_exit(&rp->r_statelock);
4066         }
4067 }
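
/*
 * For illustration, the buf lifecycle used by nfs_readahead() above can be
 * reduced to the following sketch.  It is not compiled; the helper and its
 * do_io callback are hypothetical, standing in for nfs_bio() and friends.
 */
#if 0
static int
example_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
        int (*do_io)(struct buf *, cred_t *), cred_t *cr)
{
        struct buf *bp;
        int error;

        /* Wrap the page list in a buf for a read. */
        bp = pageio_setup(pp, io_len, vp, B_READ);
        ASSERT(bp != NULL);

        bp->b_edev = 0;
        bp->b_dev = 0;
        bp->b_lblkno = lbtodb(io_off);
        bp->b_file = vp;
        bp->b_offset = (offset_t)io_off;

        bp_mapin(bp);                   /* get a kernel virtual address */
        error = do_io(bp, cr);          /* e.g. nfs_bio() */
        bp_mapout(bp);                  /* always unmap before freeing */
        pageio_done(bp);

        return (error);
}
#endif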
4068 
4069 /*
4070  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4071  * If len == 0, do from off to EOF.
4072  *
4073  * The normal cases should be len == 0 && off == 0 (entire vp list),
4074  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4075  * (from pageout).
4076  */
4077 /* ARGSUSED */
4078 static int
4079 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4080         caller_context_t *ct)
4081 {
4082         int error;
4083         rnode_t *rp;
4084 
4085         ASSERT(cr != NULL);
4086 
4087         /*
4088          * XXX - Why should this check be made here?
4089          */
4090         if (vp->v_flag & VNOMAP)
4091                 return (ENOSYS);
4092 
4093         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4094                 return (0);
4095 
4096         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4097                 return (EIO);
4098         ASSERT(off <= MAXOFF32_T);
4099 
4100         rp = VTOR(vp);
4101         mutex_enter(&rp->r_statelock);
4102         rp->r_count++;
4103         mutex_exit(&rp->r_statelock);
4104         error = nfs_putpages(vp, off, len, flags, cr);
4105         mutex_enter(&rp->r_statelock);
4106         rp->r_count--;
4107         cv_broadcast(&rp->r_cv);
4108         mutex_exit(&rp->r_statelock);
4109 
4110         return (error);
4111 }
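
/*
 * nfs_putpage() above and nfs_pageio() below bracket their i/o with the
 * same r_count handshake: bump r_count under r_statelock, do the work,
 * then drop it and cv_broadcast() so that waiters (such as the r_count
 * loop in nfs_frlock()) can proceed.  A sketch of that pattern as a pair
 * of hypothetical helpers (not compiled):
 */
#if 0
static void
example_rcount_hold(rnode_t *rp)
{
        mutex_enter(&rp->r_statelock);
        rp->r_count++;                  /* i/o in progress on this rnode */
        mutex_exit(&rp->r_statelock);
}

static void
example_rcount_rele(rnode_t *rp)
{
        mutex_enter(&rp->r_statelock);
        rp->r_count--;
        cv_broadcast(&rp->r_cv);        /* wake waiters for r_count == 0 */
        mutex_exit(&rp->r_statelock);
}
#endif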
4112 
4113 /*
4114  * Write out a single page, possibly klustering adjacent dirty pages.
4115  */
4116 int
4117 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4118         int flags, cred_t *cr)
4119 {
4120         u_offset_t io_off;
4121         u_offset_t lbn_off;
4122         u_offset_t lbn;
4123         size_t io_len;
4124         uint_t bsize;
4125         int error;
4126         rnode_t *rp;
4127 
4128         ASSERT(!vn_is_readonly(vp));
4129         ASSERT(pp != NULL);
4130         ASSERT(cr != NULL);
4131         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4132 
4133         rp = VTOR(vp);
4134         ASSERT(rp->r_count > 0);
4135 
4136         ASSERT(pp->p_offset <= MAXOFF32_T);
4137 
4138         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4139         lbn = pp->p_offset / bsize;
4140         lbn_off = lbn * bsize;
4141 
4142         /*
4143          * Find a kluster that fits in one block, or in
4144          * one page if pages are bigger than blocks.  If
4145          * there is less file space allocated than a whole
4146          * page, we'll shorten the i/o request below.
4147          */
4148         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4149             roundup(bsize, PAGESIZE), flags);
4150 
4151         /*
4152          * pvn_write_kluster shouldn't have returned a page with an offset
4153          * behind that of the original page we were given.  Verify that.
4154          */
4155         ASSERT((pp->p_offset / bsize) >= lbn);
4156 
4157         /*
4158          * pp now heads the list of dirty pages kept for write back;
4159          * pvn_write_kluster() has already handled invalidation and
4160          * freeing of the pages that were not dirty.  Check for page
4161          * length rounding problems.
4162          */
4163         if (io_off + io_len > lbn_off + bsize) {
4164                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4165                 io_len = lbn_off + bsize - io_off;
4166         }
4167         /*
4168          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4169          * consistent value of r_size.  RMODINPROGRESS is set in writerp().
4170          * When RMODINPROGRESS is set it indicates that a uiomove() is in
4171          * progress and that r_size has not yet been made consistent with
4172          * the new size of the file.  When the uiomove() completes, r_size
4173          * is updated and the RMODINPROGRESS flag is cleared.
4174          *
4175          * Without this handshaking, it is possible that nfs(3)_bio()
4176          * picks up the old value of r_size before the uiomove() in
4177          * writerp() completes, resulting in the write through
4178          * nfs(3)_bio() being dropped.
4179          *
4180          * More precisely, there is a window between the time the uiomove()
4181          * completes and the time r_size is updated.  If a VOP_PUTPAGE()
4182          * operation intervenes in this window, the page will be picked up,
4183          * because it is dirty (it will be unlocked, unless it was
4184          * pagecreate'd).  When the page is picked up as dirty, the dirty
4185          * bit is reset (pvn_getdirty()).  In nfs(3)write(), r_size is
4186          * checked.  This will still be the old size.  Therefore the page
4187          * will not be written out.  When segmap_release() calls
4188          * VOP_PUTPAGE(), the page will be found to be clean and the write
4189          * will be dropped.
4190          */
4191         if (rp->r_flags & RMODINPROGRESS) {
4192                 mutex_enter(&rp->r_statelock);
4193                 if ((rp->r_flags & RMODINPROGRESS) &&
4194                     rp->r_modaddr + MAXBSIZE > io_off &&
4195                     rp->r_modaddr < io_off + io_len) {
4196                         page_t *plist;
4197                         /*
4198                          * A write is in progress for this region of the file.
4199                          * If we did not detect RMODINPROGRESS here then this
4200                          * path through nfs_putapage() would eventually go to
4201                          * nfs(3)_bio() and may not write out all of the data
4202                          * in the pages. We end up losing data. So we decide
4203                          * to set the modified bit on each page in the page
4204                          * list and mark the rnode with RDIRTY. This write
4205                          * will be restarted at some later time.
4206                          */
4207                         plist = pp;
4208                         while (plist != NULL) {
4209                                 pp = plist;
4210                                 page_sub(&plist, pp);
4211                                 hat_setmod(pp);
4212                                 page_io_unlock(pp);
4213                                 page_unlock(pp);
4214                         }
4215                         rp->r_flags |= RDIRTY;
4216                         mutex_exit(&rp->r_statelock);
4217                         if (offp)
4218                                 *offp = io_off;
4219                         if (lenp)
4220                                 *lenp = io_len;
4221                         return (0);
4222                 }
4223                 mutex_exit(&rp->r_statelock);
4224         }
4225 
4226         if (flags & B_ASYNC) {
4227                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4228                     nfs_sync_putapage);
4229         } else
4230                 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4231 
4232         if (offp)
4233                 *offp = io_off;
4234         if (lenp)
4235                 *lenp = io_len;
4236         return (error);
4237 }
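
/*
 * A simplified sketch of the writer side of the RMODINPROGRESS handshake
 * described above (see writerp() for the real code; the function and
 * variable names here are illustrative and the details are elided).
 * Not compiled.
 */
#if 0
static void
example_writerp_handshake(rnode_t *rp, caddr_t base, int n, offset_t offset,
        struct uio *uio)
{
        mutex_enter(&rp->r_statelock);
        rp->r_flags |= RMODINPROGRESS;
        rp->r_modaddr = offset;         /* publish the region being modified */
        mutex_exit(&rp->r_statelock);

        (void) uiomove(base, n, UIO_WRITE, uio);        /* copy in new data */

        mutex_enter(&rp->r_statelock);
        if (offset + n > rp->r_size)
                rp->r_size = offset + n;        /* make r_size consistent */
        rp->r_flags &= ~RMODINPROGRESS;
        mutex_exit(&rp->r_statelock);
}
#endif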
4238 
4239 static int
4240 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4241         int flags, cred_t *cr)
4242 {
4243         int error;
4244         rnode_t *rp;
4245 
4246         flags |= B_WRITE;
4247 
4248         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4249         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4250 
4251         rp = VTOR(vp);
4252 
4253         if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4254             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4255                 if (!(rp->r_flags & ROUTOFSPACE)) {
4256                         mutex_enter(&rp->r_statelock);
4257                         rp->r_flags |= ROUTOFSPACE;
4258                         mutex_exit(&rp->r_statelock);
4259                 }
4260                 flags |= B_ERROR;
4261                 pvn_write_done(pp, flags);
4262                 /*
4263                  * If this was not an async thread, then try again to
4264                  * write out the pages, but this time, also destroy
4265                  * them whether or not the write is successful.  This
4266                  * will prevent memory from filling up with these
4267                  * pages and destroying them is the only alternative
4268                  * if they can't be written out.
4269                  *
4270                  * Don't do this if this is an async thread because
4271                  * when the pages are unlocked in pvn_write_done,
4272                  * some other thread could have come along, locked
4273                  * them, and queued them for an async thread.  All of
4274                  * the async threads could then be tied up waiting to
4275                  * lock the pages again while the pages are already
4276                  * locked, each waiting for an async thread to handle
4277                  * them.  Deadlock.
4278                  */
4279                 if (!(flags & B_ASYNC)) {
4280                         error = nfs_putpage(vp, io_off, io_len,
4281                             B_INVAL | B_FORCE, cr, NULL);
4282                 }
4283         } else {
4284                 if (error)
4285                         flags |= B_ERROR;
4286                 else if (rp->r_flags & ROUTOFSPACE) {
4287                         mutex_enter(&rp->r_statelock);
4288                         rp->r_flags &= ~ROUTOFSPACE;
4289                         mutex_exit(&rp->r_statelock);
4290                 }
4291                 pvn_write_done(pp, flags);
4292         }
4293 
4294         return (error);
4295 }
4296 
4297 /* ARGSUSED */
4298 static int
4299 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4300         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4301         caller_context_t *ct)
4302 {
4303         struct segvn_crargs vn_a;
4304         int error;
4305         rnode_t *rp;
4306         struct vattr va;
4307 
4308         if (nfs_zone() != VTOMI(vp)->mi_zone)
4309                 return (EIO);
4310 
4311         if (vp->v_flag & VNOMAP)
4312                 return (ENOSYS);
4313 
4314         if (off > MAXOFF32_T)
4315                 return (EFBIG);
4316 
4317         if (off < 0 || off + len < 0)
4318                 return (ENXIO);
4319 
4320         if (vp->v_type != VREG)
4321                 return (ENODEV);
4322 
4323         /*
4324          * If there is cached data and if close-to-open consistency
4325          * checking is not turned off and if the file system is not
4326          * mounted readonly, then force an over the wire getattr.
4327          * Otherwise, just invoke nfsgetattr to get a copy of the
4328          * attributes.  The attribute cache will be used unless it
4329          * is timed out and if it is, then an over the wire getattr
4330          * will be issued.
4331          */
4332         va.va_mask = AT_ALL;
4333         if (vn_has_cached_data(vp) &&
4334             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4335                 error = nfs_getattr_otw(vp, &va, cr);
4336         else
4337                 error = nfsgetattr(vp, &va, cr);
4338         if (error)
4339                 return (error);
4340 
4341         /*
4342          * Check to see if the vnode is currently marked as not cachable.
4343          * This means portions of the file are locked (through VOP_FRLOCK).
4344          * In this case the map request must be refused.  We use
4345          * rp->r_lkserlock to avoid a race with concurrent lock requests.
4346          */
4347         rp = VTOR(vp);
4348 
4349         /*
4350          * Atomically increment r_inmap after acquiring r_rwlock.  The
4351          * idea here is that r_rwlock is acquired to block read/write
4352          * activity, not to protect r_inmap; r_inmap informs
4353          * nfs_read/write() that we are in nfs_map().  Acquiring r_rwlock
4354          * here preserves the lock ordering and prevents the deadlock
4355          * that would occur if nfs_addmap() acquired it out of order.
4356          *
4357          * Since r_inmap is not protected by any lock, we do not hold
4358          * one when we decrement it; we atomically decrement r_inmap
4359          * after we release r_lkserlock.
4360          */
4361 
4362         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4363                 return (EINTR);
4364         atomic_inc_uint(&rp->r_inmap);
4365         nfs_rw_exit(&rp->r_rwlock);
4366 
4367         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4368                 atomic_dec_uint(&rp->r_inmap);
4369                 return (EINTR);
4370         }
4371         if (vp->v_flag & VNOCACHE) {
4372                 error = EAGAIN;
4373                 goto done;
4374         }
4375 
4376         /*
4377          * Don't allow concurrent locks and mapping if mandatory locking is
4378          * enabled.
4379          */
4380         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4381             MANDLOCK(vp, va.va_mode)) {
4382                 error = EAGAIN;
4383                 goto done;
4384         }
4385 
4386         as_rangelock(as);
4387         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4388         if (error != 0) {
4389                 as_rangeunlock(as);
4390                 goto done;
4391         }
4392 
4393         vn_a.vp = vp;
4394         vn_a.offset = off;
4395         vn_a.type = (flags & MAP_TYPE);
4396         vn_a.prot = (uchar_t)prot;
4397         vn_a.maxprot = (uchar_t)maxprot;
4398         vn_a.flags = (flags & ~MAP_TYPE);
4399         vn_a.cred = cr;
4400         vn_a.amp = NULL;
4401         vn_a.szc = 0;
4402         vn_a.lgrp_mem_policy_flags = 0;
4403 
4404         error = as_map(as, *addrp, len, segvn_create, &vn_a);
4405         as_rangeunlock(as);
4406 
4407 done:
4408         nfs_rw_exit(&rp->r_lkserlock);
4409         atomic_dec_uint(&rp->r_inmap);
4410         return (error);
4411 }
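
/*
 * From a user process, the failure modes above surface directly from
 * mmap(2): EFBIG for offsets beyond the 32-bit limit, ENODEV for
 * non-regular files, and EAGAIN when the file is locked.  An illustrative
 * userland sketch (the path and fd handling are hypothetical; error
 * handling is elided):
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>

        int fd = open("/net/server/export/file", O_RDWR);
        void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED && errno == EAGAIN) {
                /* locked via VOP_FRLOCK (VNOCACHE) or mandatory locking */
        }
#endif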
4412 
4413 /* ARGSUSED */
4414 static int
4415 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4416         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4417         caller_context_t *ct)
4418 {
4419         rnode_t *rp;
4420 
4421         if (vp->v_flag & VNOMAP)
4422                 return (ENOSYS);
4423         if (nfs_zone() != VTOMI(vp)->mi_zone)
4424                 return (EIO);
4425 
4426         rp = VTOR(vp);
4427         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4428 
4429         return (0);
4430 }
4431 
4432 /* ARGSUSED */
4433 static int
4434 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4435         struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4436 {
4437         netobj lm_fh;
4438         int rc;
4439         u_offset_t start, end;
4440         rnode_t *rp;
4441         int error = 0, intr = INTR(vp);
4442 
4443         /* check for valid cmd parameter */
4444         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4445                 return (EINVAL);
4446         if (nfs_zone() != VTOMI(vp)->mi_zone)
4447                 return (EIO);
4448 
4449         /* Verify l_type. */
4450         switch (bfp->l_type) {
4451         case F_RDLCK:
4452                 if (cmd != F_GETLK && !(flag & FREAD))
4453                         return (EBADF);
4454                 break;
4455         case F_WRLCK:
4456                 if (cmd != F_GETLK && !(flag & FWRITE))
4457                         return (EBADF);
4458                 break;
4459         case F_UNLCK:
4460                 intr = 0;
4461                 break;
4462 
4463         default:
4464                 return (EINVAL);
4465         }
4466 
4467         /* check the validity of the lock range */
4468         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4469                 return (rc);
4470         if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4471                 return (rc);
4472 
4473         /*
4474          * If the filesystem is mounted using local locking, pass the
4475          * request off to the local locking code.
4476          */
4477         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4478                 if (offset > MAXOFF32_T)
4479                         return (EFBIG);
4480                 if (cmd == F_SETLK || cmd == F_SETLKW) {
4481                         /*
4482                          * For complete safety, we should be holding
4483                          * r_lkserlock.  However, we can't call
4484                          * lm_safelock and then fs_frlock while
4485                          * holding r_lkserlock, so just invoke
4486                          * lm_safelock and expect that this will
4487                          * catch enough of the cases.
4488                          */
4489                         if (!lm_safelock(vp, bfp, cr))
4490                                 return (EAGAIN);
4491                 }
4492                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4493         }
4494 
4495         rp = VTOR(vp);
4496 
4497         /*
4498          * Check whether the given lock request can proceed, given the
4499          * current file mappings.
4500          */
4501         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4502                 return (EINTR);
4503         if (cmd == F_SETLK || cmd == F_SETLKW) {
4504                 if (!lm_safelock(vp, bfp, cr)) {
4505                         rc = EAGAIN;
4506                         goto done;
4507                 }
4508         }
4509 
4510         /*
4511          * Flush the cache after waiting for async I/O to finish.  For new
4512          * locks, this is so that the process gets the latest bits from the
4513          * server.  For unlocks, this is so that other clients see the
4514          * latest bits once the file has been unlocked.  If currently dirty
4515          * pages can't be flushed, then don't allow a lock to be set.  But
4516          * allow unlocks to succeed, to avoid having orphan locks on the
4517          * server.
4518          */
4519         if (cmd != F_GETLK) {
4520                 mutex_enter(&rp->r_statelock);
4521                 while (rp->r_count > 0) {
4522                         if (intr) {
4523                                 klwp_t *lwp = ttolwp(curthread);
4524 
4525                                 if (lwp != NULL)
4526                                         lwp->lwp_nostop++;
4527                                 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4528                                     == 0) {
4529                                         if (lwp != NULL)
4530                                                 lwp->lwp_nostop--;
4531                                         rc = EINTR;
4532                                         break;
4533                                 }
4534                                 if (lwp != NULL)
4535                                         lwp->lwp_nostop--;
4536                         } else
4537                                 cv_wait(&rp->r_cv, &rp->r_statelock);
4538                 }
4539                 mutex_exit(&rp->r_statelock);
4540                 if (rc != 0)
4541                         goto done;
4542                 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4543                 if (error) {
4544                         if (error == ENOSPC || error == EDQUOT) {
4545                                 mutex_enter(&rp->r_statelock);
4546                                 if (!rp->r_error)
4547                                         rp->r_error = error;
4548                                 mutex_exit(&rp->r_statelock);
4549                         }
4550                         if (bfp->l_type != F_UNLCK) {
4551                                 rc = ENOLCK;
4552                                 goto done;
4553                         }
4554                 }
4555         }
4556 
4557         lm_fh.n_len = sizeof (fhandle_t);
4558         lm_fh.n_bytes = (char *)VTOFH(vp);
4559 
4560         /*
4561          * Call the lock manager to do the real work of contacting
4562          * the server and obtaining the lock.
4563          */
4564         rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4565 
4566         if (rc == 0)
4567                 nfs_lockcompletion(vp, cmd);
4568 
4569 done:
4570         nfs_rw_exit(&rp->r_lkserlock);
4571         return (rc);
4572 }
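
/*
 * The bfp handled above is the flock64 built from a user-level fcntl(2)
 * request.  An illustrative userland sketch that would reach this code
 * with F_WRLCK (which requires FWRITE, per the l_type checks above); fd
 * is assumed to be an open, writable NFS file descriptor:
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

        struct flock fl;

        fl.l_type = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start = 0;
        fl.l_len = 0;                           /* to EOF */
        if (fcntl(fd, F_SETLKW, &fl) == -1)     /* may block; see intr above */
                perror("fcntl");
#endif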
4573 
4574 /*
4575  * Free storage space associated with the specified vnode.  The portion
4576  * to be freed is specified by bfp->l_start and bfp->l_len (already
4577  * normalized to a "whence" of 0).
4578  *
4579  * This is an experimental facility whose continued existence is not
4580  * guaranteed.  Currently, we only support the special case
4581  * of l_len == 0, meaning free to end of file.
4582  */
4583 /* ARGSUSED */
4584 static int
4585 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4586         offset_t offset, cred_t *cr, caller_context_t *ct)
4587 {
4588         int error;
4589 
4590         ASSERT(vp->v_type == VREG);
4591         if (cmd != F_FREESP)
4592                 return (EINVAL);
4593 
4594         if (offset > MAXOFF32_T)
4595                 return (EFBIG);
4596 
4597         if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4598             (bfp->l_len > MAXOFF32_T))
4599                 return (EFBIG);
4600 
4601         if (nfs_zone() != VTOMI(vp)->mi_zone)
4602                 return (EIO);
4603 
4604         error = convoff(vp, bfp, 0, offset);
4605         if (!error) {
4606                 ASSERT(bfp->l_start >= 0);
4607                 if (bfp->l_len == 0) {
4608                         struct vattr va;
4609 
4610                         /*
4611                          * ftruncate should not change the ctime and
4612                          * mtime if we truncate the file to its
4613                          * previous size.
4614                          */
4615                         va.va_mask = AT_SIZE;
4616                         error = nfsgetattr(vp, &va, cr);
4617                         if (error || va.va_size == bfp->l_start)
4618                                 return (error);
4619                         va.va_mask = AT_SIZE;
4620                         va.va_size = bfp->l_start;
4621                         error = nfssetattr(vp, &va, 0, cr);
4622 
4623                         if (error == 0 && bfp->l_start == 0)
4624                                 vnevent_truncate(vp, ct);
4625                 } else
4626                         error = EINVAL;
4627         }
4628 
4629         return (error);
4630 }
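
/*
 * Since only l_len == 0 is supported, F_FREESP on this filesystem amounts
 * to truncating the file at l_start.  An illustrative userland sketch
 * (fd is assumed to be an open, writable NFS file descriptor):
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

        struct flock fl;

        fl.l_whence = SEEK_SET;         /* convoff() normalizes whence to 0 */
        fl.l_start = 4096;              /* new end of file */
        fl.l_len = 0;                   /* must be 0: free to end of file */
        if (fcntl(fd, F_FREESP, &fl) == -1)
                perror("F_FREESP");
#endif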
4631 
4632 /* ARGSUSED */
4633 static int
4634 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4635 {
4636 
4637         return (EINVAL);
4638 }
4639 
4640 /*
4641  * Set up and add an address space callback to do the work of the delmap call.
4642  * The callback will be (and must be) deleted in the actual callback function.
4643  *
4644  * This is done in order to take care of the problem that we have with holding
4645  * the address space's a_lock for a long period of time (e.g. if the NFS server
4646  * is down).  Callbacks will be executed in the address space code while the
4647  * a_lock is not held.  Holding the address space's a_lock causes things such
4648  * as ps and fork to hang because they are trying to acquire this lock as well.
4649  */
4650 /* ARGSUSED */
4651 static int
4652 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4653         size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4654         caller_context_t *ct)
4655 {
4656         int                     caller_found;
4657         int                     error;
4658         rnode_t                 *rp;
4659         nfs_delmap_args_t       *dmapp;
4660         nfs_delmapcall_t        *delmap_call;
4661 
4662         if (vp->v_flag & VNOMAP)
4663                 return (ENOSYS);
4664         /*
4665          * A process may not change zones if it has NFS pages mmap'ed
4666          * in, so we can't legitimately get here from the wrong zone.
4667          */
4668         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4669 
4670         rp = VTOR(vp);
4671 
4672         /*
4673          * The way that the address space of this process deletes its mapping
4674          * of this file is via the following call chains:
4675          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4676          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4677          *
4678          * With the use of address space callbacks we are allowed to drop the
4679          * address space lock, a_lock, while executing the NFS operations that
4680          * need to go over the wire.  Returning EAGAIN to the caller of this
4681          * function is what drives the execution of the callback that we add
4682          * below.  The callback will be executed by the address space code
4683          * after dropping the a_lock.  When the callback is finished, since
4684          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4685          * is called again on the same segment to finish the rest of the work
4686          * that needs to happen during unmapping.
4687          *
4688          * This action of calling back into the segment driver causes
4689          * nfs_delmap() to get called again, but since the callback was
4690          * already executed at this point, it already did the work and there
4691          * is nothing left for us to do.
4692          *
4693          * To Summarize:
4694          * - The first time nfs_delmap is called by the current thread is when
4695          * we add the caller associated with this delmap to the delmap caller
4696          * list, add the callback, and return EAGAIN.
4697          * - The second time in this call chain when nfs_delmap is called we
4698          * will find this caller in the delmap caller list and realize there
4699          * is no more work to do thus removing this caller from the list and
4700          * returning the error that was set in the callback execution.
4701          */
4702         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4703         if (caller_found) {
4704                 /*
4705                  * 'error' is from the actual delmap operations.  To avoid
4706                  * hangs, we need to handle the return of EAGAIN differently
4707                  * since this is what drives the callback execution.
4708                  * In this case, we don't want to return EAGAIN and do the
4709                  * callback execution because there are none to execute.
4710                  */
4711                 if (error == EAGAIN)
4712                         return (0);
4713                 else
4714                         return (error);
4715         }
4716 
4717         /* current caller was not in the list */
4718         delmap_call = nfs_init_delmapcall();
4719 
4720         mutex_enter(&rp->r_statelock);
4721         list_insert_tail(&rp->r_indelmap, delmap_call);
4722         mutex_exit(&rp->r_statelock);
4723 
4724         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4725 
4726         dmapp->vp = vp;
4727         dmapp->off = off;
4728         dmapp->addr = addr;
4729         dmapp->len = len;
4730         dmapp->prot = prot;
4731         dmapp->maxprot = maxprot;
4732         dmapp->flags = flags;
4733         dmapp->cr = cr;
4734         dmapp->caller = delmap_call;
4735 
4736         error = as_add_callback(as, nfs_delmap_callback, dmapp,
4737             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4738 
4739         return (error ? error : EAGAIN);
4740 }
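
/*
 * Condensed, the two-pass protocol above is the generic as_add_callback()
 * pattern.  The sketch below uses hypothetical helper names and elides
 * the caller-list bookkeeping; it is not compiled.
 */
#if 0
static int
example_vop_with_callback(vnode_t *vp, struct as *as, caddr_t addr,
        size_t len, void *args)
{
        int error;

        /* Pass 2: the callback already ran; just collect its result. */
        if (example_find_and_delete_call(VTOR(vp), &error))
                return (error == EAGAIN ? 0 : error);

        /* Pass 1: register the callback for this unmap event... */
        error = as_add_callback(as, example_callback, args,
            AS_UNMAP_EVENT, addr, len, KM_SLEEP);

        /* ...and return EAGAIN so the caller drops a_lock and runs it. */
        return (error ? error : EAGAIN);
}
#endif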
4741 
4742 /*
4743  * Remove some pages from an mmap'd vnode.  Just update the
4744  * count of pages.  If doing close-to-open, then flush all
4745  * of the pages associated with this file.  Otherwise, start
4746  * an asynchronous page flush to write out any dirty pages.
4747  * This will also associate a credential with the rnode which
4748  * can be used to write the pages.
4749  */
4750 /* ARGSUSED */
4751 static void
4752 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4753 {
4754         int                     error;
4755         rnode_t                 *rp;
4756         mntinfo_t               *mi;
4757         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
4758 
4759         rp = VTOR(dmapp->vp);
4760         mi = VTOMI(dmapp->vp);
4761 
4762         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4763         ASSERT(rp->r_mapcnt >= 0);
4764 
4765         /*
4766          * Initiate a page flush if there are pages, the file system
4767          * was not mounted readonly, the segment was mapped shared, and
4768          * the pages themselves were writeable.
4769          */
4770         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4771             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4772                 mutex_enter(&rp->r_statelock);
4773                 rp->r_flags |= RDIRTY;
4774                 mutex_exit(&rp->r_statelock);
4775                 /*
4776                  * If this is a cross-zone access a sync putpage won't work, so
4777                  * the best we can do is try an async putpage.  That seems
4778                  * better than something more draconian such as discarding the
4779                  * dirty pages.
4780                  */
4781                 if ((mi->mi_flags & MI_NOCTO) ||
4782                     nfs_zone() != mi->mi_zone)
4783                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4784                             B_ASYNC, dmapp->cr, NULL);
4785                 else
4786                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4787                             0, dmapp->cr, NULL);
4788                 if (!error) {
4789                         mutex_enter(&rp->r_statelock);
4790                         error = rp->r_error;
4791                         rp->r_error = 0;
4792                         mutex_exit(&rp->r_statelock);
4793                 }
4794         } else
4795                 error = 0;
4796 
4797         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4798                 (void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4799                     B_INVAL, dmapp->cr, NULL);
4800 
4801         dmapp->caller->error = error;
4802         (void) as_delete_callback(as, arg);
4803         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4804 }
4805 
4806 /* ARGSUSED */
4807 static int
4808 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4809         caller_context_t *ct)
4810 {
4811         int error = 0;
4812 
4813         if (nfs_zone() != VTOMI(vp)->mi_zone)
4814                 return (EIO);
4815         /*
4816          * This looks a little weird because it's written in a general
4817          * manner but makes little use of that generality.  If cntl()
4818          * ever gets widely used, the outer switch will make more sense.
4819          */
4820 
4821         switch (cmd) {
4822 
4823         /*
4824          * Large file spec - answer this newer query with a hardcoded
4825          * constant based on the protocol (NFS V2 uses 32-bit offsets).
4826          */
4827         case _PC_FILESIZEBITS:
4828                 *valp = 32;
4829                 return (0);
4830 
4831         case _PC_LINK_MAX:
4832         case _PC_NAME_MAX:
4833         case _PC_PATH_MAX:
4834         case _PC_SYMLINK_MAX:
4835         case _PC_CHOWN_RESTRICTED:
4836         case _PC_NO_TRUNC: {
4837                 mntinfo_t *mi;
4838                 struct pathcnf *pc;
4839 
4840                 if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4841                         return (EINVAL);
4842                 error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4843                 switch (cmd) {
4844                 case _PC_LINK_MAX:
4845                         *valp = pc->pc_link_max;
4846                         break;
4847                 case _PC_NAME_MAX:
4848                         *valp = pc->pc_name_max;
4849                         break;
4850                 case _PC_PATH_MAX:
4851                 case _PC_SYMLINK_MAX:
4852                         *valp = pc->pc_path_max;
4853                         break;
4854                 case _PC_CHOWN_RESTRICTED:
4855                 case _PC_NO_TRUNC:
4856                         /*
4857                          * If we got here, error is really a boolean which
4858                          * indicates whether cmd is set or not.
4859                          */
4860                         *valp = error ? 1 : 0;  /* see above */
4861                         error = 0;
4862                         break;
4870                 }
4871                 return (error ? EINVAL : 0);
4872                 }
4873 
4874         case _PC_XATTR_EXISTS:
4875                 *valp = 0;
4876                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4877                         vnode_t *avp;
4878                         rnode_t *rp;
4879                         mntinfo_t *mi = VTOMI(vp);
4880 
4881                         if (!(mi->mi_flags & MI_EXTATTR))
4882                                 return (0);
4883 
4884                         rp = VTOR(vp);
4885                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4886                             INTR(vp)))
4887                                 return (EINTR);
4888 
4889                         error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4890                         if (error || avp == NULL)
4891                                 error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4892 
4893                         nfs_rw_exit(&rp->r_rwlock);
4894 
4895                         if (error == 0 && avp != NULL) {
4896                                 error = do_xattr_exists_check(avp, valp, cr);
4897                                 VN_RELE(avp);
4898                         }
4899                 }
4900                 return (error ? EINVAL : 0);
4901 
4902         case _PC_ACL_ENABLED:
4903                 *valp = _ACL_ACLENT_ENABLED;
4904                 return (0);
4905 
4906         default:
4907                 return (EINVAL);
4908         }
4909 }
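
/*
 * These values surface through pathconf(2)/fpathconf(2); for example,
 * _PC_FILESIZEBITS reports the hardcoded 32-bit limit above.
 * Illustrative userland sketch (the path is hypothetical):
 */
#if 0
#include <unistd.h>

        long bits = pathconf("/net/server/export/file", _PC_FILESIZEBITS);
        /* bits == 32 on an NFS Version 2 mount (see above) */
#endif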
4910 
4911 /*
4912  * Called by an async thread to do synchronous pageio.  Do the i/o, wait
4913  * for it to complete, and clean up the page list when done.
4914  */
4915 static int
4916 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4917         int flags, cred_t *cr)
4918 {
4919         int error;
4920 
4921         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4922         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4923         if (flags & B_READ)
4924                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4925         else
4926                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4927         return (error);
4928 }
4929 
4930 /* ARGSUSED */
4931 static int
4932 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4933         int flags, cred_t *cr, caller_context_t *ct)
4934 {
4935         int error;
4936         rnode_t *rp;
4937 
4938         if (pp == NULL)
4939                 return (EINVAL);
4940 
4941         if (io_off > MAXOFF32_T)
4942                 return (EFBIG);
4943         if (nfs_zone() != VTOMI(vp)->mi_zone)
4944                 return (EIO);
4945         rp = VTOR(vp);
4946         mutex_enter(&rp->r_statelock);
4947         rp->r_count++;
4948         mutex_exit(&rp->r_statelock);
4949 
4950         if (flags & B_ASYNC) {
4951                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4952                     nfs_sync_pageio);
4953         } else
4954                 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4955         mutex_enter(&rp->r_statelock);
4956         rp->r_count--;
4957         cv_broadcast(&rp->r_cv);
4958         mutex_exit(&rp->r_statelock);
4959         return (error);
4960 }
4961 
4962 /* ARGSUSED */
4963 static int
4964 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4965         caller_context_t *ct)
4966 {
4967         int error;
4968         mntinfo_t *mi;
4969 
4970         mi = VTOMI(vp);
4971 
4972         if (nfs_zone() != mi->mi_zone)
4973                 return (EIO);
4974         if (mi->mi_flags & MI_ACL) {
4975                 error = acl_setacl2(vp, vsecattr, flag, cr);
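                /*
                 * MI_ACL is re-tested because the ACL code may clear it
                 * (for example, if the server turns out not to support
                 * the NFS_ACL protocol); in that case fall through to
                 * the ENOSYS below.
                 */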
4976                 if (mi->mi_flags & MI_ACL)
4977                         return (error);
4978         }
4979 
4980         return (ENOSYS);
4981 }
4982 
4983 /* ARGSUSED */
4984 static int
4985 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4986         caller_context_t *ct)
4987 {
4988         int error;
4989         mntinfo_t *mi;
4990 
4991         mi = VTOMI(vp);
4992 
4993         if (nfs_zone() != mi->mi_zone)
4994                 return (EIO);
4995         if (mi->mi_flags & MI_ACL) {
4996                 error = acl_getacl2(vp, vsecattr, flag, cr);
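                /*
                 * As above, MI_ACL may have been cleared by the ACL code
                 * (for example, if the server does not support the NFS_ACL
                 * protocol); if so, fall through and fabricate an ACL from
                 * the attributes via fs_fab_acl().
                 */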
4997                 if (mi->mi_flags & MI_ACL)
4998                         return (error);
4999         }
5000 
5001         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
5002 }
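
/*
 * On an MI_ACL mount these requests originate from acl(2)/facl(2) with
 * aclent_t-style entries (_PC_ACL_ENABLED above reports
 * _ACL_ACLENT_ENABLED).  Illustrative userland sketch (the path is
 * hypothetical; error handling elided):
 */
#if 0
#include <sys/acl.h>
#include <stdlib.h>

        int cnt = acl("/net/server/export/file", GETACLCNT, 0, NULL);
        aclent_t *acls = malloc(cnt * sizeof (aclent_t));
        (void) acl("/net/server/export/file", GETACL, cnt, acls);
#endif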
5003 
5004 /* ARGSUSED */
5005 static int
5006 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
5007         caller_context_t *ct)
5008 {
5009         int error;
5010         struct shrlock nshr;
5011         struct nfs_owner nfs_owner;
5012         netobj lm_fh;
5013 
5014         if (nfs_zone() != VTOMI(vp)->mi_zone)
5015                 return (EIO);
5016 
5017         /*
5018          * check for valid cmd parameter
5019          */
5020         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
5021                 return (EINVAL);
5022 
5023         /*
5024          * Check access permissions
5025          */
5026         if (cmd == F_SHARE &&
5027             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
5028             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
5029                 return (EBADF);
5030 
5031         /*
5032          * If the filesystem is mounted using local locking, pass the
5033          * request off to the local share code.
5034          */
5035         if (VTOMI(vp)->mi_flags & MI_LLOCK)
5036                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
5037 
5038         switch (cmd) {
5039         case F_SHARE:
5040         case F_UNSHARE:
5041                 lm_fh.n_len = sizeof (fhandle_t);
5042                 lm_fh.n_bytes = (char *)VTOFH(vp);
5043 
5044                 /*
5045                  * If we are passed an owner that is too large to fit in an
5046                  * nfs_owner, it is likely a recursive call from the lock
5047                  * manager client, so pass it straight through.  If it is
5048                  * not an nfs_owner, simply return an error.
5049                  */
5050                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
5051                         if (((struct nfs_owner *)shr->s_owner)->magic !=
5052                             NFS_OWNER_MAGIC)
5053                                 return (EINVAL);
5054 
5055                         if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
5056                                 error = set_errno(error);
5057                         }
5058                         return (error);
5059                 }
5060                 /*
5061                  * A remote share reservation's owner is a combination of
5062                  * a magic number, the hostname, and the local owner.
5063                  */
5064                 bzero(&nfs_owner, sizeof (nfs_owner));
5065                 nfs_owner.magic = NFS_OWNER_MAGIC;
5066                 (void) strncpy(nfs_owner.hname, uts_nodename(),
5067                     sizeof (nfs_owner.hname));
5068                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5069                 nshr.s_access = shr->s_access;
5070                 nshr.s_deny = shr->s_deny;
5071                 nshr.s_sysid = 0;
5072                 nshr.s_pid = ttoproc(curthread)->p_pid;
5073                 nshr.s_own_len = sizeof (nfs_owner);
5074                 nshr.s_owner = (caddr_t)&nfs_owner;
5075 
5076                 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5077                         error = set_errno(error);
5078                 }
5079 
5080                 break;
5081 
5082         case F_HASREMOTELOCKS:
5083                 /*
5084                  * NFS client can't store remote locks itself
5085                  */
5086                 shr->s_access = 0;
5087                 error = 0;
5088                 break;
5089 
5090         default:
5091                 error = EINVAL;
5092                 break;
5093         }
5094 
5095         return (error);
5096 }