NFS4 data corruption (#3508)
If async calls are disabled, nfs4_async_putapage is supposed to do its
work synchronously. Due to a bug, it sometimes returns without writing
the page at all, leaving it dirty for later.
Unfortunately, the caller has already cleared the R4DIRTY flag.
Without R4DIRTY, nfs4_attrcache_va can't see that there are still
outstanding writes, so it accepts the file size reported by the server,
which is too small.
When the dirty page finally gets written back, the write is truncated to
that stale file size, leaving some bytes unwritten.
Reviewed by: Marcel Telka <marcel@telka.sk>
Reviewed by: Robert Gordon <rbg@openrbg.com>
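
The failure sequence is easier to see in miniature. The sketch below is a
minimal user-level model of the chain described above, not kernel code:
struct rnode_model, attrcache_size_update() and the sizes are hypothetical
stand-ins, with the guard modelled on the r_size update at the end of
nfs4_attrcache_va() in the listing that follows.

#include <stdio.h>

#define R4DIRTY 0x01            /* models the rnode4 R4DIRTY flag */

/* hypothetical user-level stand-in for the parts of rnode4_t involved */
struct rnode_model {
        unsigned flags;         /* models rp->r_flags */
        long long r_size;       /* client's idea of the file size */
        int r_count;            /* outstanding async I/O operations */
        int dirty_pages;        /* pages still waiting to be written out */
};

/* models the r_size update guard at the end of nfs4_attrcache_va() */
static void
attrcache_size_update(struct rnode_model *rp, long long server_size)
{
        if (rp->r_size != server_size &&
            (rp->dirty_pages == 0 ||
            (!(rp->flags & R4DIRTY) && rp->r_count == 0)))
                rp->r_size = server_size;
}

int
main(void)
{
        /* 8K written locally; one page not yet on the server (4K there) */
        struct rnode_model rp = { R4DIRTY, 8192, 0, 1 };

        /* the caller has cleared R4DIRTY before walking the dirty pages ... */
        rp.flags &= ~R4DIRTY;
        /* ... but the buggy no-async path punts, so the page stays dirty */

        /* a GETATTR reply arrives; the guard sees no sign of pending writes */
        attrcache_size_update(&rp, 4096);

        printf("client file size %lld with %d dirty page(s) still queued\n",
            rp.r_size, rp.dirty_pages);
        return (0);
}

Because R4DIRTY is already clear and r_count is zero, the guard accepts the
server's 4096-byte size even though a dirty page is still queued; the later
write-back is then clipped to that size and the tail of the file is lost.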

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  27  *      All Rights Reserved
  28  */
  29 
  30 #include <sys/param.h>
  31 #include <sys/types.h>
  32 #include <sys/systm.h>
  33 #include <sys/thread.h>
  34 #include <sys/t_lock.h>
  35 #include <sys/time.h>
  36 #include <sys/vnode.h>
  37 #include <sys/vfs.h>
  38 #include <sys/errno.h>
  39 #include <sys/buf.h>
  40 #include <sys/stat.h>
  41 #include <sys/cred.h>
  42 #include <sys/kmem.h>
  43 #include <sys/debug.h>
  44 #include <sys/dnlc.h>
  45 #include <sys/vmsystm.h>
  46 #include <sys/flock.h>
  47 #include <sys/share.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/tiuser.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/callb.h>
  52 #include <sys/acl.h>
  53 #include <sys/kstat.h>
  54 #include <sys/signal.h>
  55 #include <sys/disp.h>
  56 #include <sys/atomic.h>
  57 #include <sys/list.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/xdr.h>
  62 #include <rpc/auth.h>
  63 #include <rpc/clnt.h>
  64 
  65 #include <nfs/nfs.h>
  66 #include <nfs/nfs_clnt.h>
  67 #include <nfs/nfs_acl.h>
  68 
  69 #include <nfs/nfs4.h>
  70 #include <nfs/rnode4.h>
  71 #include <nfs/nfs4_clnt.h>
  72 
  73 #include <vm/hat.h>
  74 #include <vm/as.h>
  75 #include <vm/page.h>
  76 #include <vm/pvn.h>
  77 #include <vm/seg.h>
  78 #include <vm/seg_map.h>
  79 #include <vm/seg_vn.h>
  80 
  81 #include <sys/ddi.h>
  82 
  83 /*
  84  * Arguments to page-flush thread.
  85  */
  86 typedef struct {
  87         vnode_t *vp;
  88         cred_t *cr;
  89 } pgflush_t;
  90 
  91 #ifdef DEBUG
  92 int nfs4_client_lease_debug;
  93 int nfs4_sharedfh_debug;
  94 int nfs4_fname_debug;
  95 
  96 /* temporary: panic if v_type is inconsistent with r_attr va_type */
  97 int nfs4_vtype_debug;
  98 
  99 uint_t nfs4_tsd_key;
 100 #endif
 101 
 102 static time_t   nfs4_client_resumed = 0;
 103 static  callb_id_t cid = 0;
 104 
 105 static int      nfs4renew(nfs4_server_t *);
 106 static void     nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
 107 static void     nfs4_pgflush_thread(pgflush_t *);
 108 
 109 static boolean_t nfs4_client_cpr_callb(void *, int);
 110 
 111 struct mi4_globals {
 112         kmutex_t        mig_lock;  /* lock protecting mig_list */
 113         list_t          mig_list;  /* list of NFS v4 mounts in zone */
 114         boolean_t       mig_destructor_called;
 115 };
 116 
 117 static zone_key_t mi4_list_key;
 118 
 119 /*
 120  * Attributes caching:
 121  *
 122  * Attributes are cached in the rnode in struct vattr form.
 123  * There is a time associated with the cached attributes (r_time_attr_inval)
 124  * which tells whether the attributes are valid. The time is initialized
 125  * to the difference between current time and the modify time of the vnode
 126  * when new attributes are cached. This allows the attributes for
 127  * files that have changed recently to be timed out sooner than for files
 128  * that have not changed for a long time. There are minimum and maximum
 129  * timeout values that can be set per mount point.
 130  */
 131 
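As a concrete illustration of the scheme described above, here is a small
user-level sketch (plain C; attr_cache_delta() and the constants are
hypothetical, not part of the kernel) of how the cache lifetime is derived
from the time since the last detected change and clamped to the per-mount
minimum and maximum:

#include <stdio.h>

typedef long long nsec_t;       /* user-level stand-in for hrtime_t */

/*
 * Hypothetical mirror of the clamp applied for regular files: the longer a
 * file has gone without a detected change, the longer its attributes may be
 * cached, bounded below and above by the acregmin/acregmax mount options.
 */
static nsec_t
attr_cache_delta(nsec_t now, nsec_t time_attr_saved,
    nsec_t acregmin, nsec_t acregmax)
{
        nsec_t delta = now - time_attr_saved;

        if (delta < acregmin)
                delta = acregmin;
        else if (delta > acregmax)
                delta = acregmax;
        return (delta);         /* attributes stay valid until now + delta */
}

int
main(void)
{
        /* a file last seen changing 2s ago, with typical 3s..60s bounds */
        nsec_t d = attr_cache_delta(10000000000LL, 8000000000LL,
            3000000000LL, 60000000000LL);
        printf("cache attributes for %lld ns\n", d);    /* 3000000000 */
        return (0);
}

A recently changed file therefore gets only the minimum lifetime, while a
long-idle file works its way up to the maximum; nfs4_attrcache_va() further
down applies the same clamp using mi_acregmin/mi_acregmax (and the acdir
variants for directories).
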
 132 /*
 133  * If a cache purge is in progress, wait for it to finish.
 134  *
 135  * The current thread must not be in the middle of an
 136  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 137  * between this thread, a recovery thread, and the page flush thread.
 138  */
 139 int
 140 nfs4_waitfor_purge_complete(vnode_t *vp)
 141 {
 142         rnode4_t *rp;
 143         k_sigset_t smask;
 144 
 145         rp = VTOR4(vp);
 146         if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 147             ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
 148                 mutex_enter(&rp->r_statelock);
 149                 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
 150                 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 151                     ((rp->r_flags & R4PGFLUSH) &&
 152                     rp->r_pgflush != curthread)) {
 153                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 154                                 sigunintr(&smask);
 155                                 mutex_exit(&rp->r_statelock);
 156                                 return (EINTR);
 157                         }
 158                 }
 159                 sigunintr(&smask);
 160                 mutex_exit(&rp->r_statelock);
 161         }
 162         return (0);
 163 }
 164 
 165 /*
 166  * Validate caches by checking cached attributes. If they have timed out,
 167  * then get new attributes from the server.  As a side effect, cache
 168  * invalidation is done if the attributes have changed.
 169  *
 170  * If the attributes have not timed out and if there is a cache
 171  * invalidation being done by some other thread, then wait until that
 172  * thread has completed the cache invalidation.
 173  */
 174 int
 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
 176 {
 177         int error;
 178         nfs4_ga_res_t gar;
 179 
 180         if (ATTRCACHE4_VALID(vp)) {
 181                 error = nfs4_waitfor_purge_complete(vp);
 182                 if (error)
 183                         return (error);
 184                 return (0);
 185         }
 186 
 187         gar.n4g_va.va_mask = AT_ALL;
 188         return (nfs4_getattr_otw(vp, &gar, cr, 0));
 189 }
 190 
 191 /*
 192  * Fill in attribute from the cache.
 193  * If valid, then return 0 to indicate that no error occurred,
 194  * otherwise return 1 to indicate that an error occurred.
 195  */
 196 static int
 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
 198 {
 199         rnode4_t *rp;
 200 
 201         rp = VTOR4(vp);
 202         mutex_enter(&rp->r_statelock);
 203         mutex_enter(&rp->r_statev4_lock);
 204         if (ATTRCACHE4_VALID(vp)) {
 205                 mutex_exit(&rp->r_statev4_lock);
 206                 /*
 207                  * Cached attributes are valid
 208                  */
 209                 *vap = rp->r_attr;
 210                 mutex_exit(&rp->r_statelock);
 211                 return (0);
 212         }
 213         mutex_exit(&rp->r_statev4_lock);
 214         mutex_exit(&rp->r_statelock);
 215         return (1);
 216 }
 217 
 218 
 219 /*
 220  * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 221  * call is synchronous because all the pages were invalidated by the
 222  * nfs4_invalidate_pages() call.
 223  */
 224 void
 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
 226 {
 227         struct rnode4 *rp = VTOR4(vp);
 228 
 229         /* Ensure that the ..._end_op() call has been done */
 230         ASSERT(tsd_get(nfs4_tsd_key) == NULL);
 231 
 232         if (errno != ESTALE)
 233                 return;
 234 
 235         mutex_enter(&rp->r_statelock);
 236         rp->r_flags |= R4STALE;
 237         if (!rp->r_error)
 238                 rp->r_error = errno;
 239         mutex_exit(&rp->r_statelock);
 240         if (nfs4_has_pages(vp))
 241                 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
 242         nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
 243 }
 244 
 245 /*
 246  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 247  * page purge is done asynchronously.
 248  */
 249 void
 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
 251 {
 252         rnode4_t *rp;
 253         char *contents;
 254         vnode_t *xattr;
 255         int size;
 256         int pgflush;                    /* are we the page flush thread? */
 257 
 258         /*
 259          * Purge the DNLC for any entries which refer to this file.
 260          */
 261         if (vp->v_count > 1 &&
 262             (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
 263                 dnlc_purge_vp(vp);
 264 
 265         /*
 266          * Clear any readdir state bits and purge the readlink response cache.
 267          */
 268         rp = VTOR4(vp);
 269         mutex_enter(&rp->r_statelock);
 270         rp->r_flags &= ~R4LOOKUP;
 271         contents = rp->r_symlink.contents;
 272         size = rp->r_symlink.size;
 273         rp->r_symlink.contents = NULL;
 274 
 275         xattr = rp->r_xattr_dir;
 276         rp->r_xattr_dir = NULL;
 277 
 278         /*
 279          * Purge pathconf cache too.
 280          */
 281         rp->r_pathconf.pc4_xattr_valid = 0;
 282         rp->r_pathconf.pc4_cache_valid = 0;
 283 
 284         pgflush = (curthread == rp->r_pgflush);
 285         mutex_exit(&rp->r_statelock);
 286 
 287         if (contents != NULL) {
 288 
 289                 kmem_free((void *)contents, size);
 290         }
 291 
 292         if (xattr != NULL)
 293                 VN_RELE(xattr);
 294 
 295         /*
 296          * Flush the page cache.  If the current thread is the page flush
 297          * thread, don't initiate a new page flush.  There's no need for
 298          * it, and doing it correctly is hard.
 299          */
 300         if (nfs4_has_pages(vp) && !pgflush) {
 301                 if (!asyncpg) {
 302                         (void) nfs4_waitfor_purge_complete(vp);
 303                         nfs4_flush_pages(vp, cr);
 304                 } else {
 305                         pgflush_t *args;
 306 
 307                         /*
 308                          * We don't hold r_statelock while creating the
 309                          * thread, in case the call blocks.  So we use a
 310                          * flag to indicate that a page flush thread is
 311                          * active.
 312                          */
 313                         mutex_enter(&rp->r_statelock);
 314                         if (rp->r_flags & R4PGFLUSH) {
 315                                 mutex_exit(&rp->r_statelock);
 316                         } else {
 317                                 rp->r_flags |= R4PGFLUSH;
 318                                 mutex_exit(&rp->r_statelock);
 319 
 320                                 args = kmem_alloc(sizeof (pgflush_t),
 321                                     KM_SLEEP);
 322                                 args->vp = vp;
 323                                 VN_HOLD(args->vp);
 324                                 args->cr = cr;
 325                                 crhold(args->cr);
 326                                 (void) zthread_create(NULL, 0,
 327                                     nfs4_pgflush_thread, args, 0,
 328                                     minclsyspri);
 329                         }
 330                 }
 331         }
 332 
 333         /*
 334          * Flush the readdir response cache.
 335          */
 336         nfs4_purge_rddir_cache(vp);
 337 }
 338 
 339 /*
 340  * Invalidate all pages for the given file, after writing back the dirty
 341  * ones.
 342  */
 343 
 344 void
 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
 346 {
 347         int error;
 348         rnode4_t *rp = VTOR4(vp);
 349 
 350         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
 351         if (error == ENOSPC || error == EDQUOT) {
 352                 mutex_enter(&rp->r_statelock);
 353                 if (!rp->r_error)
 354                         rp->r_error = error;
 355                 mutex_exit(&rp->r_statelock);
 356         }
 357 }
 358 
 359 /*
 360  * Page flush thread.
 361  */
 362 
 363 static void
 364 nfs4_pgflush_thread(pgflush_t *args)
 365 {
 366         rnode4_t *rp = VTOR4(args->vp);
 367 
 368         /* remember which thread we are, so we don't deadlock ourselves */
 369         mutex_enter(&rp->r_statelock);
 370         ASSERT(rp->r_pgflush == NULL);
 371         rp->r_pgflush = curthread;
 372         mutex_exit(&rp->r_statelock);
 373 
 374         nfs4_flush_pages(args->vp, args->cr);
 375 
 376         mutex_enter(&rp->r_statelock);
 377         rp->r_pgflush = NULL;
 378         rp->r_flags &= ~R4PGFLUSH;
 379         cv_broadcast(&rp->r_cv);
 380         mutex_exit(&rp->r_statelock);
 381 
 382         VN_RELE(args->vp);
 383         crfree(args->cr);
 384         kmem_free(args, sizeof (pgflush_t));
 385         zthread_exit();
 386 }
 387 
 388 /*
 389  * Purge the readdir cache of all entries which are not currently
 390  * being filled.
 391  */
 392 void
 393 nfs4_purge_rddir_cache(vnode_t *vp)
 394 {
 395         rnode4_t *rp;
 396 
 397         rp = VTOR4(vp);
 398 
 399         mutex_enter(&rp->r_statelock);
 400         rp->r_direof = NULL;
 401         rp->r_flags &= ~R4LOOKUP;
 402         rp->r_flags |= R4READDIRWATTR;
 403         rddir4_cache_purge(rp);
 404         mutex_exit(&rp->r_statelock);
 405 }
 406 
 407 /*
 408  * Set attributes cache for given vnode using virtual attributes.  There is
 409  * no cache validation, but if the attributes are deemed to be stale, they
 410  * are ignored.  This corresponds to nfs3_attrcache().
 411  *
 412  * Set the timeout value on the attribute cache and fill it
 413  * with the passed in attributes.
 414  */
 415 void
 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
 417 {
 418         rnode4_t *rp = VTOR4(vp);
 419 
 420         mutex_enter(&rp->r_statelock);
 421         if (rp->r_time_attr_saved <= t)
 422                 nfs4_attrcache_va(vp, garp, FALSE);
 423         mutex_exit(&rp->r_statelock);
 424 }
 425 
 426 /*
 427  * Use the passed in virtual attributes to check to see whether the
 428  * data and metadata caches are valid, cache the new attributes, and
 429  * then do the cache invalidation if required.
 430  *
 431  * The cache validation and caching of the new attributes is done
 432  * atomically via the use of the mutex, r_statelock.  If required,
 433  * the cache invalidation is done atomically w.r.t. the cache
 434  * validation and caching of the attributes via the pseudo lock,
 435  * r_serial.
 436  *
 437  * This routine is used to do cache validation and attributes caching
 438  * for operations with a single set of post operation attributes.
 439  */
 440 
 441 void
 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
 443     hrtime_t t, cred_t *cr, int async,
 444     change_info4 *cinfo)
 445 {
 446         rnode4_t *rp;
 447         int mtime_changed = 0;
 448         int ctime_changed = 0;
 449         vsecattr_t *vsp;
 450         int was_serial, set_time_cache_inval, recov;
 451         vattr_t *vap = &garp->n4g_va;
 452         mntinfo4_t *mi = VTOMI4(vp);
 453         len_t preattr_rsize;
 454         boolean_t writemodify_set = B_FALSE;
 455         boolean_t cachepurge_set = B_FALSE;
 456 
 457         ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
 458 
 459         /* Is curthread the recovery thread? */
 460         mutex_enter(&mi->mi_lock);
 461         recov = (VTOMI4(vp)->mi_recovthread == curthread);
 462         mutex_exit(&mi->mi_lock);
 463 
 464         rp = VTOR4(vp);
 465         mutex_enter(&rp->r_statelock);
 466         was_serial = (rp->r_serial == curthread);
 467         if (rp->r_serial && !was_serial) {
 468                 klwp_t *lwp = ttolwp(curthread);
 469 
 470                 /*
 471                  * If we're the recovery thread, then purge current attrs
 472                  * and bail out to avoid potential deadlock between another
 473                  * thread caching attrs (r_serial thread), recov thread,
 474                  * and an async writer thread.
 475                  */
 476                 if (recov) {
 477                         PURGE_ATTRCACHE4_LOCKED(rp);
 478                         mutex_exit(&rp->r_statelock);
 479                         return;
 480                 }
 481 
 482                 if (lwp != NULL)
 483                         lwp->lwp_nostop++;
 484                 while (rp->r_serial != NULL) {
 485                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 486                                 mutex_exit(&rp->r_statelock);
 487                                 if (lwp != NULL)
 488                                         lwp->lwp_nostop--;
 489                                 return;
 490                         }
 491                 }
 492                 if (lwp != NULL)
 493                         lwp->lwp_nostop--;
 494         }
 495 
 496         /*
 497          * If there is a page flush thread, the current thread needs to
 498          * bail out, to prevent a possible deadlock between the current
 499          * thread (which might be in a start_op/end_op region), the
 500          * recovery thread, and the page flush thread.  Expire the
 501          * attribute cache, so that any attributes the current thread was
 502          * going to set are not lost.
 503          */
 504         if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
 505                 PURGE_ATTRCACHE4_LOCKED(rp);
 506                 mutex_exit(&rp->r_statelock);
 507                 return;
 508         }
 509 
 510         if (rp->r_time_attr_saved > t) {
 511                 /*
 512                  * Attributes have been cached since these attributes were
 513                  * probably made. If there is an inconsistency in what is
 514                  * cached, mark them invalid. If not, don't act on them.
 515                  */
 516                 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 517                         PURGE_ATTRCACHE4_LOCKED(rp);
 518                 mutex_exit(&rp->r_statelock);
 519                 return;
 520         }
 521         set_time_cache_inval = 0;
 522         if (cinfo) {
 523                 /*
 524                  * Only directory modifying callers pass non-NULL cinfo.
 525                  */
 526                 ASSERT(vp->v_type == VDIR);
 527                 /*
 528                  * If the cache timeout either doesn't exist or hasn't expired,
 529  * and dir didn't change on server before dirmod op
 530                  * and dir didn't change after dirmod op but before getattr
 531                  * then there's a chance that the client's cached data for
 532                  * this object is current (not stale).  No immediate cache
 533                  * flush is required.
 534                  *
 535                  */
 536                 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
 537                     cinfo->before == rp->r_change &&
 538                     (garp->n4g_change_valid &&
 539                     cinfo->after == garp->n4g_change)) {
 540 
 541                         /*
 542                          * If atomic isn't set, then the before/after info
 543                          * cannot be blindly trusted.  For this case, we tell
 544                          * nfs4_attrcache_va to cache the attrs but also
 545                          * establish an absolute maximum cache timeout.  When
 546                          * the timeout is reached, caches will be flushed.
 547                          */
 548                         if (! cinfo->atomic)
 549                                 set_time_cache_inval = 1;
 550                 } else {
 551 
 552                         /*
 553                          * We're not sure exactly what changed, but we know
 554                          * what to do.  flush all caches for dir.  remove the
 555                          * attr timeout.
 556                          *
 557                          * a) timeout expired.  flush all caches.
 558                          * b) r_change != cinfo.before.  flush all caches.
 559                          * c) r_change == cinfo.before, but cinfo.after !=
 560                          *    post-op getattr(change).  flush all caches.
 561                          * d) post-op getattr(change) not provided by server.
 562                          *    flush all caches.
 563                          */
 564                         mtime_changed = 1;
 565                         ctime_changed = 1;
 566                         rp->r_time_cache_inval = 0;
 567                 }
 568         } else {
 569                 /*
 570                  * The write thread, after writing data to the file on the
 571                  * remote server, always sets R4WRITEMODIFIED to indicate that
 572                  * the file was modified with a WRITE operation and will have
 573                  * marked the attribute cache as timed out. If R4WRITEMODIFIED
 574                  * is set, then do not check for mtime and ctime changes.
 575                  */
 576                 if (!(rp->r_flags & R4WRITEMODIFIED)) {
 577                         if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 578                                 mtime_changed = 1;
 579 
 580                         if (rp->r_attr.va_ctime.tv_sec !=
 581                             vap->va_ctime.tv_sec ||
 582                             rp->r_attr.va_ctime.tv_nsec !=
 583                             vap->va_ctime.tv_nsec)
 584                                 ctime_changed = 1;
 585                 } else {
 586                         writemodify_set = B_TRUE;
 587                 }
 588         }
 589 
 590         preattr_rsize = rp->r_size;
 591 
 592         nfs4_attrcache_va(vp, garp, set_time_cache_inval);
 593 
 594         /*
 595          * If we have updated the file size in nfs4_attrcache_va, then as
 596          * soon as we drop statelock we will be in the middle of purging
 597          * all our caches and updating them. It is possible for another
 598          * thread to pick up this new file size and read in zeroed data.
 599          * Stall other threads till the cache purge is complete.
 600          */
 601         if ((!cinfo) && (rp->r_size != preattr_rsize)) {
 602                 /*
 603                  * If R4WRITEMODIFIED was set and we have updated the file
 604                  * size, Server's returned file size need not necessarily
 605                  * be because of this Client's WRITE. We need to purge
 606                  * all caches.
 607                  */
 608                 if (writemodify_set)
 609                         mtime_changed = 1;
 610 
 611                 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
 612                         rp->r_flags |= R4INCACHEPURGE;
 613                         cachepurge_set = B_TRUE;
 614                 }
 615         }
 616 
 617         if (!mtime_changed && !ctime_changed) {
 618                 mutex_exit(&rp->r_statelock);
 619                 return;
 620         }
 621 
 622         rp->r_serial = curthread;
 623 
 624         mutex_exit(&rp->r_statelock);
 625 
 626         /*
 627          * If we're the recov thread, then force async nfs4_purge_caches
 628          * to avoid potential deadlock.
 629          */
 630         if (mtime_changed)
 631                 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
 632 
 633         if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
 634                 mutex_enter(&rp->r_statelock);
 635                 rp->r_flags &= ~R4INCACHEPURGE;
 636                 cv_broadcast(&rp->r_cv);
 637                 mutex_exit(&rp->r_statelock);
 638                 cachepurge_set = B_FALSE;
 639         }
 640 
 641         if (ctime_changed) {
 642                 (void) nfs4_access_purge_rp(rp);
 643                 if (rp->r_secattr != NULL) {
 644                         mutex_enter(&rp->r_statelock);
 645                         vsp = rp->r_secattr;
 646                         rp->r_secattr = NULL;
 647                         mutex_exit(&rp->r_statelock);
 648                         if (vsp != NULL)
 649                                 nfs4_acl_free_cache(vsp);
 650                 }
 651         }
 652 
 653         if (!was_serial) {
 654                 mutex_enter(&rp->r_statelock);
 655                 rp->r_serial = NULL;
 656                 cv_broadcast(&rp->r_cv);
 657                 mutex_exit(&rp->r_statelock);
 658         }
 659 }
 660 
 661 /*
 662  * Set attributes cache for given vnode using virtual attributes.
 663  *
 664  * Set the timeout value on the attribute cache and fill it
 665  * with the passed in attributes.
 666  *
 667  * The caller must be holding r_statelock.
 668  */
 669 static void
 670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
 671 {
 672         rnode4_t *rp;
 673         mntinfo4_t *mi;
 674         hrtime_t delta;
 675         hrtime_t now;
 676         vattr_t *vap = &garp->n4g_va;
 677 
 678         rp = VTOR4(vp);
 679 
 680         ASSERT(MUTEX_HELD(&rp->r_statelock));
 681         ASSERT(vap->va_mask == AT_ALL);
 682 
 683         /* Switch to master before checking v_flag */
 684         if (IS_SHADOW(vp, rp))
 685                 vp = RTOV4(rp);
 686 
 687         now = gethrtime();
 688 
 689         mi = VTOMI4(vp);
 690 
 691         /*
 692          * Only establish a new cache timeout (if requested).  Never
 693          * extend a timeout.  Never clear a timeout.  Clearing a timeout
 694          * is done by nfs4_update_dircaches (ancestor in our call chain)
 695          */
 696         if (set_cache_timeout && ! rp->r_time_cache_inval)
 697                 rp->r_time_cache_inval = now + mi->mi_acdirmax;
 698 
 699         /*
 700          * Delta is the number of nanoseconds that we will
 701          * cache the attributes of the file.  It is based on
 702          * the number of nanoseconds since the last time that
 703          * we detected a change.  The assumption is that files
 704          * that changed recently are likely to change again.
 705          * There is a minimum and a maximum for regular files
 706          * and for directories which is enforced though.
 707          *
 708          * Using the time since last change was detected
 709          * eliminates direct comparison or calculation
 710          * using mixed client and server times.  NFS does
 711          * not make any assumptions regarding the client
 712          * and server clocks being synchronized.
 713          */
 714         if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 715             vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 716             vap->va_size != rp->r_attr.va_size) {
 717                 rp->r_time_attr_saved = now;
 718         }
 719 
 720         if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
 721                 delta = 0;
 722         else {
 723                 delta = now - rp->r_time_attr_saved;
 724                 if (vp->v_type == VDIR) {
 725                         if (delta < mi->mi_acdirmin)
 726                                 delta = mi->mi_acdirmin;
 727                         else if (delta > mi->mi_acdirmax)
 728                                 delta = mi->mi_acdirmax;
 729                 } else {
 730                         if (delta < mi->mi_acregmin)
 731                                 delta = mi->mi_acregmin;
 732                         else if (delta > mi->mi_acregmax)
 733                                 delta = mi->mi_acregmax;
 734                 }
 735         }
 736         rp->r_time_attr_inval = now + delta;
 737 
 738         rp->r_attr = *vap;
 739         if (garp->n4g_change_valid)
 740                 rp->r_change = garp->n4g_change;
 741 
 742         /*
 743          * The attributes that were returned may be valid and can
 744          * be used, but they may not be allowed to be cached.
 745          * Reset the timers to cause immediate invalidation and
 746          * clear r_change so no VERIFY operations will succeed
 747          */
 748         if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
 749                 rp->r_time_attr_inval = now;
 750                 rp->r_time_attr_saved = now;
 751                 rp->r_change = 0;
 752         }
 753 
 754         /*
 755          * If mounted_on_fileid returned AND the object is a stub,
 756          * then set object's va_nodeid to the mounted over fid
 757          * returned by server.
 758          *
 759          * If mounted_on_fileid not provided/supported, then
 760          * just set it to 0 for now.  Eventually it would be
 761          * better to set it to a hashed version of FH.  This
 762          * would probably be good enough to provide a unique
 763          * fid/d_ino within a dir.
 764          *
 765          * We don't need to carry mounted_on_fileid in the
 766          * rnode as long as the client never requests fileid
 767          * without also requesting mounted_on_fileid.  For
 768          * now, it stays.
 769          */
 770         if (garp->n4g_mon_fid_valid) {
 771                 rp->r_mntd_fid = garp->n4g_mon_fid;
 772 
 773                 if (RP_ISSTUB(rp))
 774                         rp->r_attr.va_nodeid = rp->r_mntd_fid;
 775         }
 776 
 777         /*
 778          * Check to see if there are valid pathconf bits to
 779          * cache in the rnode.
 780          */
 781         if (garp->n4g_ext_res) {
 782                 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
 783                         rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
 784                 } else {
 785                         if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
 786                                 rp->r_pathconf.pc4_xattr_valid = TRUE;
 787                                 rp->r_pathconf.pc4_xattr_exists =
 788                                     garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
 789                         }
 790                 }
 791         }
 792         /*
 793          * Update the size of the file if there is no cached data or if
 794          * the cached data is clean and there is no data being written
 795          * out.
 796          */
 797         if (rp->r_size != vap->va_size &&
 798             (!vn_has_cached_data(vp) ||
 799             (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
 800                 rp->r_size = vap->va_size;
 801         }
 802         nfs_setswaplike(vp, vap);
 803         rp->r_flags &= ~R4WRITEMODIFIED;
 804 }
 805 
 806 /*
 807  * Get attributes over-the-wire and update attributes cache
 808  * if no error occurred in the over-the-wire operation.
 809  * Return 0 if successful, otherwise error.
 810  */
 811 int
 812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
 813 {
 814         mntinfo4_t *mi = VTOMI4(vp);
 815         hrtime_t t;
 816         nfs4_recov_state_t recov_state;
 817         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 818 
 819         recov_state.rs_flags = 0;
 820         recov_state.rs_num_retry_despite_err = 0;
 821 
 822         /* Save the original mount point security flavor */
 823         (void) save_mnt_secinfo(mi->mi_curr_serv);
 824 
 825 recov_retry:
 826 
 827         if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
 828             &recov_state, NULL))) {
 829                 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 830                 return (e.error);
 831         }
 832 
 833         t = gethrtime();
 834 
 835         nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
 836 
 837         if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
 838                 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
 839                     NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
 840                         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
 841                             &recov_state, 1);
 842                         goto recov_retry;
 843                 }
 844         }
 845 
 846         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
 847 
 848         if (!e.error) {
 849                 if (e.stat == NFS4_OK) {
 850                         nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
 851                 } else {
 852                         e.error = geterrno4(e.stat);
 853 
 854                         nfs4_purge_stale_fh(e.error, vp, cr);
 855                 }
 856         }
 857 
 858         /*
 859          * If getattr is done on a node that is a stub for a crossed
 860          * mount point, keep the original secinfo flavor for
 861          * the current file system, not the crossed one.
 862          */
 863         (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 864 
 865         return (e.error);
 866 }
 867 
 868 /*
 869  * Generate a compound to get attributes over-the-wire.
 870  */
 871 void
 872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
 873     nfs4_error_t *ep, cred_t *cr, int get_acl)
 874 {
 875         COMPOUND4args_clnt args;
 876         COMPOUND4res_clnt res;
 877         int doqueue;
 878         rnode4_t *rp = VTOR4(vp);
 879         nfs_argop4 argop[2];
 880 
 881         args.ctag = TAG_GETATTR;
 882 
 883         args.array_len = 2;
 884         args.array = argop;
 885 
 886         /* putfh */
 887         argop[0].argop = OP_CPUTFH;
 888         argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
 889 
 890         /* getattr */
 891         /*
 892          * Unlike nfs version 2 and 3, where getattr returns all the
 893          * attributes, nfs version 4 returns only the ones explicitly
 894          * asked for. This creates problems, as some system functions
 895          * (e.g. cache check) require certain attributes and if the
 896          * cached node lacks some attributes such as uid/gid, it can
 897          * affect system utilities (e.g. "ls") that rely on the information
 898          * to be there. This can lead to anything from system crashes to
 899          * corrupted information processed by user apps.
 900          * So to ensure that all bases are covered, request at least
 901          * the AT_ALL attribute mask.
 902          */
 903         argop[1].argop = OP_GETATTR;
 904         argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
 905         if (get_acl)
 906                 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
 907         argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
 908 
 909         doqueue = 1;
 910 
 911         rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
 912 
 913         if (ep->error)
 914                 return;
 915 
 916         if (res.status != NFS4_OK) {
 917                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 918                 return;
 919         }
 920 
 921         *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
 922 
 923         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 924 }
 925 
 926 /*
 927  * Return either cached or remote attributes. If get remote attr
 928  * use them to check and invalidate caches, then cache the new attributes.
 929  */
 930 int
 931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
 932 {
 933         int error;
 934         rnode4_t *rp;
 935         nfs4_ga_res_t gar;
 936 
 937         ASSERT(nfs4_consistent_type(vp));
 938 
 939         /*
 940          * If we've got cached attributes, we're done, otherwise go
 941          * to the server to get attributes, which will update the cache
 942          * in the process. Either way, use the cached attributes for
 943          * the caller's vattr_t.
 944          *
 945          * Note that we ignore the gar set by the OTW call: the attr caching
 946          * code may make adjustments when storing to the rnode, and we want
 947          * to see those changes here.
 948          */
 949         rp = VTOR4(vp);
 950         error = 0;
 951         mutex_enter(&rp->r_statelock);
 952         if (!ATTRCACHE4_VALID(vp)) {
 953                 mutex_exit(&rp->r_statelock);
 954                 error = nfs4_getattr_otw(vp, &gar, cr, 0);
 955                 mutex_enter(&rp->r_statelock);
 956         }
 957 
 958         if (!error)
 959                 *vap = rp->r_attr;
 960 
 961         /* Return the client's view of file size */
 962         vap->va_size = rp->r_size;
 963 
 964         mutex_exit(&rp->r_statelock);
 965 
 966         ASSERT(nfs4_consistent_type(vp));
 967 
 968         return (error);
 969 }
 970 
 971 int
 972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
 973     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
 974 {
 975         COMPOUND4args_clnt args;
 976         COMPOUND4res_clnt res;
 977         int doqueue;
 978         nfs_argop4 argop[2];
 979         mntinfo4_t *mi = VTOMI4(vp);
 980         bool_t needrecov = FALSE;
 981         nfs4_recov_state_t recov_state;
 982         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 983         nfs4_ga_ext_res_t *gerp;
 984 
 985         recov_state.rs_flags = 0;
 986         recov_state.rs_num_retry_despite_err = 0;
 987 
 988 recov_retry:
 989         args.ctag = tag_type;
 990 
 991         args.array_len = 2;
 992         args.array = argop;
 993 
 994         e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
 995         if (e.error)
 996                 return (e.error);
 997 
 998         /* putfh */
 999         argop[0].argop = OP_CPUTFH;
1000         argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1001 
1002         /* getattr */
1003         argop[1].argop = OP_GETATTR;
1004         argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1005         argop[1].nfs_argop4_u.opgetattr.mi = mi;
1006 
1007         doqueue = 1;
1008 
1009         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1010             "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1011             rnode4info(VTOR4(vp))));
1012 
1013         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1014 
1015         needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1016         if (!needrecov && e.error) {
1017                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1018                     needrecov);
1019                 return (e.error);
1020         }
1021 
1022         if (needrecov) {
1023                 bool_t abort;
1024 
1025                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1026                     "nfs4_attr_otw: initiating recovery\n"));
1027 
1028                 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1029                     NULL, OP_GETATTR, NULL, NULL, NULL);
1030                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1031                     needrecov);
1032                 if (!e.error) {
1033                         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1034                         e.error = geterrno4(res.status);
1035                 }
1036                 if (abort == FALSE)
1037                         goto recov_retry;
1038                 return (e.error);
1039         }
1040 
1041         if (res.status) {
1042                 e.error = geterrno4(res.status);
1043         } else {
1044                 gerp = garp->n4g_ext_res;
1045                 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1046                     garp, sizeof (nfs4_ga_res_t));
1047                 garp->n4g_ext_res = gerp;
1048                 if (garp->n4g_ext_res &&
1049                     res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1050                         bcopy(res.array[1].nfs_resop4_u.opgetattr.
1051                             ga_res.n4g_ext_res,
1052                             garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1053         }
1054         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1055         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1056             needrecov);
1057         return (e.error);
1058 }
1059 
1060 /*
1061  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1062  * for the demand-based allocation of async threads per-mount.  The
1063  * nfs_async_timeout is the amount of time a thread will live after it
1064  * becomes idle, unless new I/O requests are received before the thread
1065  * dies.  See nfs4_async_putapage and nfs4_async_start.
1066  */
1067 
1068 static void     nfs4_async_start(struct vfs *);
1069 static void     nfs4_async_pgops_start(struct vfs *);
1070 static void     nfs4_async_common_start(struct vfs *, int);
1071 
1072 static void
1073 free_async_args4(struct nfs4_async_reqs *args)
1074 {
1075         rnode4_t *rp;
1076 
1077         if (args->a_io != NFS4_INACTIVE) {
1078                 rp = VTOR4(args->a_vp);
1079                 mutex_enter(&rp->r_statelock);
1080                 rp->r_count--;
1081                 if (args->a_io == NFS4_PUTAPAGE ||
1082                     args->a_io == NFS4_PAGEIO)
1083                         rp->r_awcount--;
1084                 cv_broadcast(&rp->r_cv);
1085                 mutex_exit(&rp->r_statelock);
1086                 VN_RELE(args->a_vp);
1087         }
1088         crfree(args->a_cred);
1089         kmem_free(args, sizeof (*args));
1090 }
1091 
1092 /*
1093  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1094  * pageout(), running in the global zone, have legitimate reasons to do
1095  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1096  * use of a per-mount "asynchronous requests manager thread" which is
1097  * signaled by the various asynchronous work routines when there is
1098  * asynchronous work to be done.  It is responsible for creating new
1099  * worker threads if necessary, and notifying existing worker threads
1100  * that there is work to be done.
1101  *
1102  * In other words, it will "take the specifications from the customers and
1103  * give them to the engineers."
1104  *
1105  * Worker threads die off of their own accord if they are no longer
1106  * needed.
1107  *
1108  * This thread is killed when the zone is going away or the filesystem
1109  * is being unmounted.
1110  */
1111 void
1112 nfs4_async_manager(vfs_t *vfsp)
1113 {
1114         callb_cpr_t cprinfo;
1115         mntinfo4_t *mi;
1116         uint_t max_threads;
1117 
1118         mi = VFTOMI4(vfsp);
1119 
1120         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121             "nfs4_async_manager");
1122 
1123         mutex_enter(&mi->mi_async_lock);
1124         /*
1125          * We want to stash the max number of threads that this mount was
1126          * allowed so we can use it later when the variable is set to zero as
1127          * part of the zone/mount going away.
1128          *
1129          * We want to be able to create at least one thread to handle
1130          * asynchronous inactive calls.
1131          */
1132         max_threads = MAX(mi->mi_max_threads, 1);
1133         /*
1134          * We don't want to wait for mi_max_threads to go to zero, since that
1135          * happens as part of a failed unmount, but this thread should only
1136          * exit when the mount is really going away.
1137          *
1138          * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1139          * attempted: the various _async_*() functions know to do things
1140          * inline if mi_max_threads == 0.  Henceforth we just drain out the
1141          * outstanding requests.
1142          *
1143          * Note that we still create zthreads even if we notice the zone is
1144          * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1145          * shutdown sequence to take slightly longer in some cases, but
1146          * doesn't violate the protocol, as all threads will exit as soon as
1147          * they're done processing the remaining requests.
1148          */
1149         for (;;) {
1150                 while (mi->mi_async_req_count > 0) {
1151                         /*
1152                          * Paranoia: If the mount started out having
1153                          * (mi->mi_max_threads == 0), and the value was
1154                          * later changed (via a debugger or somesuch),
1155                          * we could be confused since we will think we
1156                          * can't create any threads, and the calling
1157                          * code (which looks at the current value of
1158                          * mi->mi_max_threads, now non-zero) thinks we
1159                          * can.
1160                          *
1161                          * So, because we're paranoid, we create threads
1162                          * up to the maximum of the original and the
1163                          * current value. This means that future
1164                          * (debugger-induced) alterations of
1165                          * mi->mi_max_threads are ignored for our
1166                          * purposes, but who told them they could change
1167                          * random values on a live kernel anyhow?
1168                          */
1169                         if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1170                             MAX(mi->mi_max_threads, max_threads)) {
1171                                 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1172                                 mutex_exit(&mi->mi_async_lock);
1173                                 MI4_HOLD(mi);
1174                                 VFS_HOLD(vfsp); /* hold for new thread */
1175                                 (void) zthread_create(NULL, 0, nfs4_async_start,
1176                                     vfsp, 0, minclsyspri);
1177                                 mutex_enter(&mi->mi_async_lock);
1178                         } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1179                             NUM_ASYNC_PGOPS_THREADS) {
1180                                 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1181                                 mutex_exit(&mi->mi_async_lock);
1182                                 MI4_HOLD(mi);
1183                                 VFS_HOLD(vfsp); /* hold for new thread */
1184                                 (void) zthread_create(NULL, 0,
1185                                     nfs4_async_pgops_start, vfsp, 0,
1186                                     minclsyspri);
1187                                 mutex_enter(&mi->mi_async_lock);
1188                         }
1189                         NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1190                         ASSERT(mi->mi_async_req_count != 0);
1191                         mi->mi_async_req_count--;
1192                 }
1193 
1194                 mutex_enter(&mi->mi_lock);
1195                 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1196                         mutex_exit(&mi->mi_lock);
1197                         break;
1198                 }
1199                 mutex_exit(&mi->mi_lock);
1200 
1201                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1202                 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1203                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1204         }
1205 
1206         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1207             "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1208         /*
1209          * Let everyone know we're done.
1210          */
1211         mi->mi_manager_thread = NULL;
1212         /*
1213          * Wake up the inactive thread.
1214          */
1215         cv_broadcast(&mi->mi_inact_req_cv);
1216         /*
1217          * Wake up anyone sitting in nfs4_async_manager_stop()
1218          */
1219         cv_broadcast(&mi->mi_async_cv);
1220         /*
1221          * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1222          * since CALLB_CPR_EXIT is actually responsible for releasing
1223          * 'mi_async_lock'.
1224          */
1225         CALLB_CPR_EXIT(&cprinfo);
1226         VFS_RELE(vfsp); /* release thread's hold */
1227         MI4_RELE(mi);
1228         zthread_exit();
1229 }
1230 
1231 /*
1232  * Signal (and wait for) the async manager thread to clean up and go away.
1233  */
1234 void
1235 nfs4_async_manager_stop(vfs_t *vfsp)
1236 {
1237         mntinfo4_t *mi = VFTOMI4(vfsp);
1238 
1239         mutex_enter(&mi->mi_async_lock);
1240         mutex_enter(&mi->mi_lock);
1241         mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1242         mutex_exit(&mi->mi_lock);
1243         cv_broadcast(&mi->mi_async_reqs_cv);
1244         /*
1245          * Wait for the async manager thread to die.
1246          */
1247         while (mi->mi_manager_thread != NULL)
1248                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1249         mutex_exit(&mi->mi_async_lock);
1250 }
1251 
1252 int
1253 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1254     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1255     u_offset_t, caddr_t, struct seg *, cred_t *))
1256 {
1257         rnode4_t *rp;
1258         mntinfo4_t *mi;
1259         struct nfs4_async_reqs *args;
1260 
1261         rp = VTOR4(vp);
1262         ASSERT(rp->r_freef == NULL);
1263 
1264         mi = VTOMI4(vp);
1265 
1266         /*
1267          * If addr falls in a different segment, don't bother doing readahead.
1268          */
1269         if (addr >= seg->s_base + seg->s_size)
1270                 return (-1);
1271 
1272         /*
1273          * If we can't allocate a request structure, punt on the readahead.
1274          */
1275         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1276                 return (-1);
1277 
1278         /*
1279          * If a lock operation is pending, don't initiate any new
1280          * readaheads.  Otherwise, bump r_count to indicate the new
1281          * asynchronous I/O.
1282          */
1283         if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1284                 kmem_free(args, sizeof (*args));
1285                 return (-1);
1286         }
1287         mutex_enter(&rp->r_statelock);
1288         rp->r_count++;
1289         mutex_exit(&rp->r_statelock);
1290         nfs_rw_exit(&rp->r_lkserlock);
1291 
1292         args->a_next = NULL;
1293 #ifdef DEBUG
1294         args->a_queuer = curthread;
1295 #endif
1296         VN_HOLD(vp);
1297         args->a_vp = vp;
1298         ASSERT(cr != NULL);
1299         crhold(cr);
1300         args->a_cred = cr;
1301         args->a_io = NFS4_READ_AHEAD;
1302         args->a_nfs4_readahead = readahead;
1303         args->a_nfs4_blkoff = blkoff;
1304         args->a_nfs4_seg = seg;
1305         args->a_nfs4_addr = addr;
1306 
1307         mutex_enter(&mi->mi_async_lock);
1308 
1309         /*
1310          * If asyncio has been disabled, don't bother readahead.
1311          */
1312         if (mi->mi_max_threads == 0) {
1313                 mutex_exit(&mi->mi_async_lock);
1314                 goto noasync;
1315         }
1316 
1317         /*
1318          * Link request structure into the async list and
1319          * wakeup async thread to do the i/o.
1320          */
1321         if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1322                 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1323                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1324         } else {
1325                 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1326                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1327         }
1328 
1329         if (mi->mi_io_kstats) {
1330                 mutex_enter(&mi->mi_lock);
1331                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1332                 mutex_exit(&mi->mi_lock);
1333         }
1334 
1335         mi->mi_async_req_count++;
1336         ASSERT(mi->mi_async_req_count != 0);
1337         cv_signal(&mi->mi_async_reqs_cv);
1338         mutex_exit(&mi->mi_async_lock);
1339         return (0);
1340 
1341 noasync:
1342         mutex_enter(&rp->r_statelock);
1343         rp->r_count--;
1344         cv_broadcast(&rp->r_cv);
1345         mutex_exit(&rp->r_statelock);
1346         VN_RELE(vp);
1347         crfree(cr);
1348         kmem_free(args, sizeof (*args));
1349         return (-1);
1350 }
1351 
1352 static void
1353 nfs4_async_start(struct vfs *vfsp)
1354 {
1355         nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1356 }
1357 
1358 static void
1359 nfs4_async_pgops_start(struct vfs *vfsp)
1360 {
1361         nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1362 }
1363 
1364 /*
1365  * The async queues for each mounted file system are arranged as a
1366  * set of queues, one for each async i/o type.  Requests are taken
1367  * from the queues in a round-robin fashion.  A number of consecutive
1368  * requests are taken from each queue before moving on to the next
1369  * queue.  This functionality may allow the NFS Version 2 server to do
1370  * write clustering, even if the client is mixing writes and reads
1371  * because it will take multiple write requests from the queue
1372  * before processing any of the other async i/o types.
1373  *
1374  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1375  * model defined by cpr to suspend the system. Specifically over the
1376  * wire calls are cpr-unsafe. The thread should be reevaluated in
1377  * case of future updates to the cpr model.
1378  */
1379 static void
1380 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1381 {
1382         struct nfs4_async_reqs *args;
1383         mntinfo4_t *mi = VFTOMI4(vfsp);
1384         clock_t time_left = 1;
1385         callb_cpr_t cprinfo;
1386         int i;
1387         extern int nfs_async_timeout;
1388         int async_types;
1389         kcondvar_t *async_work_cv;
1390 
1391         if (async_queue == NFS4_ASYNC_QUEUE) {
1392                 async_types = NFS4_ASYNC_TYPES;
1393                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1394         } else {
1395                 async_types = NFS4_ASYNC_PGOPS_TYPES;
1396                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1397         }
1398 
1399         /*
1400          * Dynamic initialization of nfs_async_timeout to allow nfs to be
1401          * built in an implementation independent manner.
1402          */
1403         if (nfs_async_timeout == -1)
1404                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1405 
1406         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1407 
1408         mutex_enter(&mi->mi_async_lock);
1409         for (;;) {
1410                 /*
1411                  * Find the next queue containing an entry.  We start
1412                  * at the current queue pointer and then round robin
1413                  * through all of them until we either find a non-empty
1414                  * queue or have looked through all of them.
1415                  */
1416                 for (i = 0; i < async_types; i++) {
1417                         args = *mi->mi_async_curr[async_queue];
1418                         if (args != NULL)
1419                                 break;
1420                         mi->mi_async_curr[async_queue]++;
1421                         if (mi->mi_async_curr[async_queue] ==
1422                             &mi->mi_async_reqs[async_types]) {
1423                                 mi->mi_async_curr[async_queue] =
1424                                     &mi->mi_async_reqs[0];
1425                         }
1426                 }
1427                 /*
1428                  * If we didn't find an entry, then block until woken up
1429                  * again and then look through the queues again.
1430                  */
1431                 if (args == NULL) {
1432                         /*
1433                          * Exiting is considered to be safe for CPR as well
1434                          */
1435                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1436 
1437                         /*
1438                          * Wakeup thread waiting to unmount the file
1439                          * system only if all async threads are inactive.
1440                          *
1441                          * If we've timed out and there's nothing to do,
1442                          * then get rid of this thread.
1443                          */
1444                         if (mi->mi_max_threads == 0 || time_left <= 0) {
1445                                 --mi->mi_threads[async_queue];
1446 
1447                                 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1448                                     mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1449                                         cv_signal(&mi->mi_async_cv);
1450                                 CALLB_CPR_EXIT(&cprinfo);
1451                                 VFS_RELE(vfsp); /* release thread's hold */
1452                                 MI4_RELE(mi);
1453                                 zthread_exit();
1454                                 /* NOTREACHED */
1455                         }
1456                         time_left = cv_reltimedwait(async_work_cv,
1457                             &mi->mi_async_lock, nfs_async_timeout,
1458                             TR_CLOCK_TICK);
1459 
1460                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1461 
1462                         continue;
1463                 } else {
1464                         time_left = 1;
1465                 }
1466 
1467                 /*
1468                  * Remove the request from the async queue and then
1469                  * update the current async request queue pointer.  If
1470                  * the current queue is empty or we have removed enough
1471                  * consecutive entries from it, then reset the counter
1472                  * for this queue and then move the current pointer to
1473                  * the next queue.
1474                  */
1475                 *mi->mi_async_curr[async_queue] = args->a_next;
1476                 if (*mi->mi_async_curr[async_queue] == NULL ||
1477                     --mi->mi_async_clusters[args->a_io] == 0) {
1478                         mi->mi_async_clusters[args->a_io] =
1479                             mi->mi_async_init_clusters;
1480                         mi->mi_async_curr[async_queue]++;
1481                         if (mi->mi_async_curr[async_queue] ==
1482                             &mi->mi_async_reqs[async_types]) {
1483                                 mi->mi_async_curr[async_queue] =
1484                                     &mi->mi_async_reqs[0];
1485                         }
1486                 }
1487 
1488                 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1489                         mutex_enter(&mi->mi_lock);
1490                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1491                         mutex_exit(&mi->mi_lock);
1492                 }
1493 
1494                 mutex_exit(&mi->mi_async_lock);
1495 
1496                 /*
1497                  * Obtain arguments from the async request structure.
1498                  */
1499                 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1500                         (*args->a_nfs4_readahead)(args->a_vp,
1501                             args->a_nfs4_blkoff, args->a_nfs4_addr,
1502                             args->a_nfs4_seg, args->a_cred);
1503                 } else if (args->a_io == NFS4_PUTAPAGE) {
1504                         (void) (*args->a_nfs4_putapage)(args->a_vp,
1505                             args->a_nfs4_pp, args->a_nfs4_off,
1506                             args->a_nfs4_len, args->a_nfs4_flags,
1507                             args->a_cred);
1508                 } else if (args->a_io == NFS4_PAGEIO) {
1509                         (void) (*args->a_nfs4_pageio)(args->a_vp,
1510                             args->a_nfs4_pp, args->a_nfs4_off,
1511                             args->a_nfs4_len, args->a_nfs4_flags,
1512                             args->a_cred);
1513                 } else if (args->a_io == NFS4_READDIR) {
1514                         (void) ((*args->a_nfs4_readdir)(args->a_vp,
1515                             args->a_nfs4_rdc, args->a_cred));
1516                 } else if (args->a_io == NFS4_COMMIT) {
1517                         (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1518                             args->a_nfs4_offset, args->a_nfs4_count,
1519                             args->a_cred);
1520                 } else if (args->a_io == NFS4_INACTIVE) {
1521                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1522                 }
1523 
1524                 /*
1525                  * Now, release the vnode and free the credentials
1526                  * structure.
1527                  */
1528                 free_async_args4(args);
1529                 /*
1530                  * Reacquire the mutex because it is needed at the top of the loop.
1531                  */
1532                 mutex_enter(&mi->mi_async_lock);
1533         }
1534 }
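
/*
 * Editor's sketch (not part of the original source): the round-robin
 * selection and per-type clustering described above, reduced to a
 * self-contained model.  All names here (example_req_t, example_qset_t,
 * example_next_req) are hypothetical and only basic C is assumed; the
 * real code walks mi_async_curr/mi_async_clusters under mi_async_lock.
 */
typedef struct example_req {
        struct example_req *r_next;
} example_req_t;

typedef struct example_qset {
        example_req_t   *q_head[8];     /* one list per async i/o type */
        int             q_curr;         /* round-robin cursor */
        int             q_cluster;      /* requests left before advancing */
        int             q_init_cluster; /* cluster size to reset to */
} example_qset_t;

static example_req_t *
example_next_req(example_qset_t *qs, int ntypes)
{
        example_req_t *req = NULL;
        int i;

        /* Find the first non-empty queue, starting at the cursor. */
        for (i = 0; i < ntypes; i++) {
                if ((req = qs->q_head[qs->q_curr]) != NULL)
                        break;
                qs->q_curr = (qs->q_curr + 1) % ntypes;
        }
        if (req == NULL)
                return (NULL);

        /*
         * Dequeue the request; advance to the next queue once this one
         * is empty or its cluster of consecutive requests is used up.
         */
        qs->q_head[qs->q_curr] = req->r_next;
        if (qs->q_head[qs->q_curr] == NULL || --qs->q_cluster == 0) {
                qs->q_cluster = qs->q_init_cluster;
                qs->q_curr = (qs->q_curr + 1) % ntypes;
        }
        return (req);
}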
1535 
1536 /*
1537  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1538  * part of VOP_INACTIVE.
1539  */
1540 
1541 void
1542 nfs4_inactive_thread(mntinfo4_t *mi)
1543 {
1544         struct nfs4_async_reqs *args;
1545         callb_cpr_t cprinfo;
1546         vfs_t *vfsp = mi->mi_vfsp;
1547 
1548         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1549             "nfs4_inactive_thread");
1550 
1551         for (;;) {
1552                 mutex_enter(&mi->mi_async_lock);
1553                 args = mi->mi_async_reqs[NFS4_INACTIVE];
1554                 if (args == NULL) {
1555                         mutex_enter(&mi->mi_lock);
1556                         /*
1557                          * We don't want to exit until the async manager is done
1558                          * with its work; hence the check for mi_manager_thread
1559                          * being NULL.
1560                          *
1561                          * The async manager thread will cv_broadcast() on
1562                          * mi_inact_req_cv when it's done, at which point we'll
1563                          * wake up and exit.
1564                          */
1565                         if (mi->mi_manager_thread == NULL)
1566                                 goto die;
1567                         mi->mi_flags |= MI4_INACTIVE_IDLE;
1568                         mutex_exit(&mi->mi_lock);
1569                         cv_signal(&mi->mi_async_cv);
1570                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1571                         cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1572                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1573                         mutex_exit(&mi->mi_async_lock);
1574                 } else {
1575                         mutex_enter(&mi->mi_lock);
1576                         mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1577                         mutex_exit(&mi->mi_lock);
1578                         mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1579                         mutex_exit(&mi->mi_async_lock);
1580                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1581                         crfree(args->a_cred);
1582                         kmem_free(args, sizeof (*args));
1583                 }
1584         }
1585 die:
1586         mutex_exit(&mi->mi_lock);
1587         mi->mi_inactive_thread = NULL;
1588         cv_signal(&mi->mi_async_cv);
1589 
1590         /*
1591          * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1592          * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1593          */
1594         CALLB_CPR_EXIT(&cprinfo);
1595 
1596         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1597             "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1598 
1599         MI4_RELE(mi);
1600         zthread_exit();
1601         /* NOTREACHED */
1602 }
1603 
1604 /*
1605  * nfs4_async_stop:
1606  * Wait for all outstanding putpage operations and the inactive thread to
1607  * complete; like nfs4_async_stop_sig(), but without interruptibility.
1608  */
1609 void
1610 nfs4_async_stop(struct vfs *vfsp)
1611 {
1612         mntinfo4_t *mi = VFTOMI4(vfsp);
1613 
1614         /*
1615          * Wait for all outstanding async operations to complete and for
1616          * worker threads to exit.
1617          */
1618         mutex_enter(&mi->mi_async_lock);
1619         mi->mi_max_threads = 0;
1620         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1621         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1622             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1623                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1624 
1625         /*
1626          * Wait for the inactive thread to finish doing what it's doing.  It
1627          * won't exit until the last reference to the vfs_t goes away.
1628          */
1629         if (mi->mi_inactive_thread != NULL) {
1630                 mutex_enter(&mi->mi_lock);
1631                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1632                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1633                         mutex_exit(&mi->mi_lock);
1634                         cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635                         mutex_enter(&mi->mi_lock);
1636                 }
1637                 mutex_exit(&mi->mi_lock);
1638         }
1639         mutex_exit(&mi->mi_async_lock);
1640 }
1641 
1642 /*
1643  * nfs4_async_stop_sig:
1644  * Wait for all outstanding putpage operations and the inactive thread to
1645  * complete. If a signal is delivered we will abort and return non-zero;
1646  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1647  * need to make it interruptible.
1648  */
1649 int
1650 nfs4_async_stop_sig(struct vfs *vfsp)
1651 {
1652         mntinfo4_t *mi = VFTOMI4(vfsp);
1653         ushort_t omax;
1654         bool_t intr = FALSE;
1655 
1656         /*
1657          * Wait for all outstanding putpage operations to complete and for
1658          * worker threads to exit.
1659          */
1660         mutex_enter(&mi->mi_async_lock);
1661         omax = mi->mi_max_threads;
1662         mi->mi_max_threads = 0;
1663         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1664         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1665             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1666                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1667                         intr = TRUE;
1668                         goto interrupted;
1669                 }
1670         }
1671 
1672         /*
1673          * Wait for the inactive thread to finish doing what it's doing.  It
1674          * won't exit until the last reference to the vfs_t goes away.
1675          */
1676         if (mi->mi_inactive_thread != NULL) {
1677                 mutex_enter(&mi->mi_lock);
1678                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680                         mutex_exit(&mi->mi_lock);
1681                         if (!cv_wait_sig(&mi->mi_async_cv,
1682                             &mi->mi_async_lock)) {
1683                                 intr = TRUE;
1684                                 goto interrupted;
1685                         }
1686                         mutex_enter(&mi->mi_lock);
1687                 }
1688                 mutex_exit(&mi->mi_lock);
1689         }
1690 interrupted:
1691         if (intr)
1692                 mi->mi_max_threads = omax;
1693         mutex_exit(&mi->mi_async_lock);
1694 
1695         return (intr);
1696 }
1697 
1698 int
1699 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701     u_offset_t, size_t, int, cred_t *))
1702 {
1703         rnode4_t *rp;
1704         mntinfo4_t *mi;
1705         struct nfs4_async_reqs *args;
1706 
1707         ASSERT(flags & B_ASYNC);
1708         ASSERT(vp->v_vfsp != NULL);
1709 
1710         rp = VTOR4(vp);
1711         ASSERT(rp->r_count > 0);
1712 
1713         mi = VTOMI4(vp);
1714 
1715         /*
1716          * If we can't allocate a request structure, do the putpage
1717          * operation synchronously in this thread's context.
1718          */
1719         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720                 goto noasync;
1721 
1722         args->a_next = NULL;
1723 #ifdef DEBUG
1724         args->a_queuer = curthread;
1725 #endif
1726         VN_HOLD(vp);
1727         args->a_vp = vp;
1728         ASSERT(cr != NULL);
1729         crhold(cr);
1730         args->a_cred = cr;
1731         args->a_io = NFS4_PUTAPAGE;
1732         args->a_nfs4_putapage = putapage;
1733         args->a_nfs4_pp = pp;
1734         args->a_nfs4_off = off;
1735         args->a_nfs4_len = (uint_t)len;
1736         args->a_nfs4_flags = flags;
1737 
1738         mutex_enter(&mi->mi_async_lock);
1739 
1740         /*
1741          * If asyncio has been disabled, then make a synchronous request.
1742          * This check is done a second time in case async io was disabled
1743          * while this thread was blocked waiting for memory pressure to
1744          * reduce or for the queue to drain.
1745          */
1746         if (mi->mi_max_threads == 0) {
1747                 mutex_exit(&mi->mi_async_lock);
1748 
1749                 VN_RELE(vp);
1750                 crfree(cr);
1751                 kmem_free(args, sizeof (*args));
1752                 goto noasync;
1753         }
1754 
1755         /*
1756          * Link request structure into the async list and
1757          * wakeup async thread to do the i/o.
1758          */
1759         if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760                 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762         } else {
1763                 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765         }
1766 
1767         mutex_enter(&rp->r_statelock);
1768         rp->r_count++;
1769         rp->r_awcount++;
1770         mutex_exit(&rp->r_statelock);
1771 
1772         if (mi->mi_io_kstats) {
1773                 mutex_enter(&mi->mi_lock);
1774                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775                 mutex_exit(&mi->mi_lock);
1776         }
1777 
1778         mi->mi_async_req_count++;
1779         ASSERT(mi->mi_async_req_count != 0);
1780         cv_signal(&mi->mi_async_reqs_cv);
1781         mutex_exit(&mi->mi_async_lock);
1782         return (0);
1783 
1784 noasync:
1785 
1786         if (curproc == proc_pageout || curproc == proc_fsflush) {
1787                 /*
1788                  * If we get here in the context of the pageout/fsflush,
1789                  * or we have run out of memory, or we're attempting to
1790                  * unmount, we refuse to do a sync write, because this may
1791                  * hang pageout/fsflush and the machine. In this case,
1792                  * we just re-mark the page as dirty and punt on the page.
1793                  *
1794                  * Make sure B_FORCE isn't set.  We can re-mark the
1795                  * pages as dirty and unlock the pages in one swoop by
1796                  * passing in B_ERROR to pvn_write_done().  However,
1797                  * we should make sure B_FORCE isn't set - we don't
1798                  * want the page tossed before it gets written out.
1799                  */
1800                 if (flags & B_FORCE)
1801                         flags &= ~(B_INVAL | B_FORCE);
1802                 pvn_write_done(pp, flags | B_ERROR);
1803                 return (0);
1804         }
1805 
1806         if (nfs_zone() != mi->mi_zone) {
1807                 /*
1808                  * So this was a cross-zone sync putpage.
1809                  *
1810                  * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1811                  * as dirty and unlock them.
1812                  *
1813                  * We don't want to clear B_FORCE here as the caller presumably
1814                  * knows what they're doing if they set it.
1815                  */
1816                 pvn_write_done(pp, flags | B_ERROR);
1817                 return (EPERM);
1818         }
1819         return ((*putapage)(vp, pp, off, len, flags, cr));
1820 }
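
/*
 * Editor's sketch (not part of the original source): the decision made
 * on the noasync: path above, reduced to its three outcomes.  The names
 * (example_putapage_fallback, in_pageout_ctx, cross_zone, sync_putapage)
 * are hypothetical; the page re-dirtying via pvn_write_done() is elided.
 */
static int
example_putapage_fallback(int in_pageout_ctx, int cross_zone,
    int (*sync_putapage)(void))
{
        if (in_pageout_ctx) {
                /*
                 * pageout/fsflush context: never write synchronously here,
                 * since that could hang pageout/fsflush; the page has been
                 * re-marked dirty and will be written later.
                 */
                return (0);
        }
        if (cross_zone) {
                /* Wrong zone: the page is re-marked dirty and we fail. */
                return (EPERM);
        }
        /* Otherwise fall back to a synchronous write in this context. */
        return (sync_putapage());
}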
1821 
1822 int
1823 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1824     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1825     size_t, int, cred_t *))
1826 {
1827         rnode4_t *rp;
1828         mntinfo4_t *mi;
1829         struct nfs4_async_reqs *args;
1830 
1831         ASSERT(flags & B_ASYNC);
1832         ASSERT(vp->v_vfsp != NULL);
1833 
1834         rp = VTOR4(vp);
1835         ASSERT(rp->r_count > 0);
1836 
1837         mi = VTOMI4(vp);
1838 
1839         /*
1840          * If we can't allocate a request structure, do the pageio
1841          * request synchronously in this thread's context.
1842          */
1843         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1844                 goto noasync;
1845 
1846         args->a_next = NULL;
1847 #ifdef DEBUG
1848         args->a_queuer = curthread;
1849 #endif
1850         VN_HOLD(vp);
1851         args->a_vp = vp;
1852         ASSERT(cr != NULL);
1853         crhold(cr);
1854         args->a_cred = cr;
1855         args->a_io = NFS4_PAGEIO;
1856         args->a_nfs4_pageio = pageio;
1857         args->a_nfs4_pp = pp;
1858         args->a_nfs4_off = io_off;
1859         args->a_nfs4_len = (uint_t)io_len;
1860         args->a_nfs4_flags = flags;
1861 
1862         mutex_enter(&mi->mi_async_lock);
1863 
1864         /*
1865          * If asyncio has been disabled, then make a synchronous request.
1866          * This check is done a second time in case async io was disabled
1867          * while this thread was blocked waiting for memory pressure to
1868          * reduce or for the queue to drain.
1869          */
1870         if (mi->mi_max_threads == 0) {
1871                 mutex_exit(&mi->mi_async_lock);
1872 
1873                 VN_RELE(vp);
1874                 crfree(cr);
1875                 kmem_free(args, sizeof (*args));
1876                 goto noasync;
1877         }
1878 
1879         /*
1880          * Link request structure into the async list and
1881          * wakeup async thread to do the i/o.
1882          */
1883         if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1884                 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1885                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1886         } else {
1887                 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1888                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1889         }
1890 
1891         mutex_enter(&rp->r_statelock);
1892         rp->r_count++;
1893         rp->r_awcount++;
1894         mutex_exit(&rp->r_statelock);
1895 
1896         if (mi->mi_io_kstats) {
1897                 mutex_enter(&mi->mi_lock);
1898                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1899                 mutex_exit(&mi->mi_lock);
1900         }
1901 
1902         mi->mi_async_req_count++;
1903         ASSERT(mi->mi_async_req_count != 0);
1904         cv_signal(&mi->mi_async_reqs_cv);
1905         mutex_exit(&mi->mi_async_lock);
1906         return (0);
1907 
1908 noasync:
1909         /*
1910          * If we can't do it ASYNC, for reads we do nothing (but clean up
1911          * the page list), for writes we do it synchronously, except for
1912          * proc_pageout/proc_fsflush as described below.
1913          */
1914         if (flags & B_READ) {
1915                 pvn_read_done(pp, flags | B_ERROR);
1916                 return (0);
1917         }
1918 
1919         if (curproc == proc_pageout || curproc == proc_fsflush) {
1920                 /*
1921                  * If we get here in the context of the pageout/fsflush,
1922                  * we refuse to do a sync write, because this may hang
1923                  * pageout/fsflush (and the machine). In this case, we just
1924                  * re-mark the page as dirty and punt on the page.
1925                  *
1926                  * Make sure B_FORCE isn't set.  We can re-mark the
1927                  * pages as dirty and unlock the pages in one swoop by
1928                  * passing in B_ERROR to pvn_write_done().  However,
1929                  * we should make sure B_FORCE isn't set - we don't
1930                  * want the page tossed before it gets written out.
1931                  */
1932                 if (flags & B_FORCE)
1933                         flags &= ~(B_INVAL | B_FORCE);
1934                 pvn_write_done(pp, flags | B_ERROR);
1935                 return (0);
1936         }
1937 
1938         if (nfs_zone() != mi->mi_zone) {
1939                 /*
1940                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1941                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1942                  * them.
1943                  *
1944                  * We don't want to clear B_FORCE here as the caller presumably
1945                  * knows what they're doing if they set it.
1946                  */
1947                 pvn_write_done(pp, flags | B_ERROR);
1948                 return (EPERM);
1949         }
1950         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1951 }
1952 
1953 void
1954 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1955     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1956 {
1957         rnode4_t *rp;
1958         mntinfo4_t *mi;
1959         struct nfs4_async_reqs *args;
1960 
1961         rp = VTOR4(vp);
1962         ASSERT(rp->r_freef == NULL);
1963 
1964         mi = VTOMI4(vp);
1965 
1966         /*
1967          * If we can't allocate a request structure, skip the readdir.
1968          */
1969         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1970                 goto noasync;
1971 
1972         args->a_next = NULL;
1973 #ifdef DEBUG
1974         args->a_queuer = curthread;
1975 #endif
1976         VN_HOLD(vp);
1977         args->a_vp = vp;
1978         ASSERT(cr != NULL);
1979         crhold(cr);
1980         args->a_cred = cr;
1981         args->a_io = NFS4_READDIR;
1982         args->a_nfs4_readdir = readdir;
1983         args->a_nfs4_rdc = rdc;
1984 
1985         mutex_enter(&mi->mi_async_lock);
1986 
1987         /*
1988          * If asyncio has been disabled, then skip this request
1989          */
1990         if (mi->mi_max_threads == 0) {
1991                 mutex_exit(&mi->mi_async_lock);
1992 
1993                 VN_RELE(vp);
1994                 crfree(cr);
1995                 kmem_free(args, sizeof (*args));
1996                 goto noasync;
1997         }
1998 
1999         /*
2000          * Link request structure into the async list and
2001          * wakeup async thread to do the i/o.
2002          */
2003         if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2004                 mi->mi_async_reqs[NFS4_READDIR] = args;
2005                 mi->mi_async_tail[NFS4_READDIR] = args;
2006         } else {
2007                 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2008                 mi->mi_async_tail[NFS4_READDIR] = args;
2009         }
2010 
2011         mutex_enter(&rp->r_statelock);
2012         rp->r_count++;
2013         mutex_exit(&rp->r_statelock);
2014 
2015         if (mi->mi_io_kstats) {
2016                 mutex_enter(&mi->mi_lock);
2017                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2018                 mutex_exit(&mi->mi_lock);
2019         }
2020 
2021         mi->mi_async_req_count++;
2022         ASSERT(mi->mi_async_req_count != 0);
2023         cv_signal(&mi->mi_async_reqs_cv);
2024         mutex_exit(&mi->mi_async_lock);
2025         return;
2026 
2027 noasync:
2028         mutex_enter(&rp->r_statelock);
2029         rdc->entries = NULL;
2030         /*
2031          * Indicate that no one is trying to fill this entry and
2032          * it still needs to be filled.
2033          */
2034         rdc->flags &= ~RDDIR;
2035         rdc->flags |= RDDIRREQ;
2036         rddir4_cache_rele(rp, rdc);
2037         mutex_exit(&rp->r_statelock);
2038 }
2039 
2040 void
2041 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2042     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2043     cred_t *))
2044 {
2045         rnode4_t *rp;
2046         mntinfo4_t *mi;
2047         struct nfs4_async_reqs *args;
2048         page_t *pp;
2049 
2050         rp = VTOR4(vp);
2051         mi = VTOMI4(vp);
2052 
2053         /*
2054          * If we can't allocate a request structure, do the commit
2055          * operation synchronously in this thread's context.
2056          */
2057         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2058                 goto noasync;
2059 
2060         args->a_next = NULL;
2061 #ifdef DEBUG
2062         args->a_queuer = curthread;
2063 #endif
2064         VN_HOLD(vp);
2065         args->a_vp = vp;
2066         ASSERT(cr != NULL);
2067         crhold(cr);
2068         args->a_cred = cr;
2069         args->a_io = NFS4_COMMIT;
2070         args->a_nfs4_commit = commit;
2071         args->a_nfs4_plist = plist;
2072         args->a_nfs4_offset = offset;
2073         args->a_nfs4_count = count;
2074 
2075         mutex_enter(&mi->mi_async_lock);
2076 
2077         /*
2078          * If asyncio has been disabled, then make a synchronous request.
2079          * This check is done a second time in case async io was disabled
2080          * while this thread was blocked waiting for memory pressure to
2081          * reduce or for the queue to drain.
2082          */
2083         if (mi->mi_max_threads == 0) {
2084                 mutex_exit(&mi->mi_async_lock);
2085 
2086                 VN_RELE(vp);
2087                 crfree(cr);
2088                 kmem_free(args, sizeof (*args));
2089                 goto noasync;
2090         }
2091 
2092         /*
2093          * Link request structure into the async list and
2094          * wakeup async thread to do the i/o.
2095          */
2096         if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2097                 mi->mi_async_reqs[NFS4_COMMIT] = args;
2098                 mi->mi_async_tail[NFS4_COMMIT] = args;
2099         } else {
2100                 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2101                 mi->mi_async_tail[NFS4_COMMIT] = args;
2102         }
2103 
2104         mutex_enter(&rp->r_statelock);
2105         rp->r_count++;
2106         mutex_exit(&rp->r_statelock);
2107 
2108         if (mi->mi_io_kstats) {
2109                 mutex_enter(&mi->mi_lock);
2110                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2111                 mutex_exit(&mi->mi_lock);
2112         }
2113 
2114         mi->mi_async_req_count++;
2115         ASSERT(mi->mi_async_req_count != 0);
2116         cv_signal(&mi->mi_async_reqs_cv);
2117         mutex_exit(&mi->mi_async_lock);
2118         return;
2119 
2120 noasync:
2121         if (curproc == proc_pageout || curproc == proc_fsflush ||
2122             nfs_zone() != mi->mi_zone) {
2123                 while (plist != NULL) {
2124                         pp = plist;
2125                         page_sub(&plist, pp);
2126                         pp->p_fsdata = C_COMMIT;
2127                         page_unlock(pp);
2128                 }
2129                 return;
2130         }
2131         (*commit)(vp, plist, offset, count, cr);
2132 }
2133 
2134 /*
2135  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2136  * reference to the vnode is handed over to the thread; the caller should
2137  * no longer refer to the vnode.
2138  *
2139  * Unlike most of the async routines, this handoff is needed for
2140  * correctness reasons, not just performance.  So doing operations in the
2141  * context of the current thread is not an option.
2142  */
2143 void
2144 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2145 {
2146         mntinfo4_t *mi;
2147         struct nfs4_async_reqs *args;
2148         boolean_t signal_inactive_thread = B_FALSE;
2149 
2150         mi = VTOMI4(vp);
2151 
2152         args = kmem_alloc(sizeof (*args), KM_SLEEP);
2153         args->a_next = NULL;
2154 #ifdef DEBUG
2155         args->a_queuer = curthread;
2156 #endif
2157         args->a_vp = vp;
2158         ASSERT(cr != NULL);
2159         crhold(cr);
2160         args->a_cred = cr;
2161         args->a_io = NFS4_INACTIVE;
2162 
2163         /*
2164          * Note that we don't check mi->mi_max_threads here, since we
2165          * *need* to get rid of this vnode regardless of whether someone
2166          * set nfs4_max_threads to zero in /etc/system.
2167          *
2168          * The manager thread knows about this and is willing to create
2169          * at least one thread to accommodate us.
2170          */
2171         mutex_enter(&mi->mi_async_lock);
2172         if (mi->mi_inactive_thread == NULL) {
2173                 rnode4_t *rp;
2174                 vnode_t *unldvp = NULL;
2175                 char *unlname;
2176                 cred_t *unlcred;
2177 
2178                 mutex_exit(&mi->mi_async_lock);
2179                 /*
2180                  * We just need to free up the memory associated with the
2181                  * vnode, which can be safely done from within the current
2182                  * context.
2183                  */
2184                 crfree(cr);     /* drop our reference */
2185                 kmem_free(args, sizeof (*args));
2186                 rp = VTOR4(vp);
2187                 mutex_enter(&rp->r_statelock);
2188                 if (rp->r_unldvp != NULL) {
2189                         unldvp = rp->r_unldvp;
2190                         rp->r_unldvp = NULL;
2191                         unlname = rp->r_unlname;
2192                         rp->r_unlname = NULL;
2193                         unlcred = rp->r_unlcred;
2194                         rp->r_unlcred = NULL;
2195                 }
2196                 mutex_exit(&rp->r_statelock);
2197                 /*
2198                  * No need to explicitly throw away any cached pages.  The
2199                  * eventual r4inactive() will attempt a synchronous
2200                  * VOP_PUTPAGE() which will immediately fail since the request
2201                  * is coming from the wrong zone, and then will proceed to call
2202                  * nfs4_invalidate_pages() which will clean things up for us.
2203                  *
2204                  * Throw away the delegation here so rp4_addfree()'s attempt to
2205                  * return any existing delegations becomes a no-op.
2206                  */
2207                 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2208                         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2209                             FALSE);
2210                         (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2211                         nfs_rw_exit(&mi->mi_recovlock);
2212                 }
2213                 nfs4_clear_open_streams(rp);
2214 
2215                 rp4_addfree(rp, cr);
2216                 if (unldvp != NULL) {
2217                         kmem_free(unlname, MAXNAMELEN);
2218                         VN_RELE(unldvp);
2219                         crfree(unlcred);
2220                 }
2221                 return;
2222         }
2223 
2224         if (mi->mi_manager_thread == NULL) {
2225                 /*
2226                  * We want to talk to the inactive thread.
2227                  */
2228                 signal_inactive_thread = B_TRUE;
2229         }
2230 
2231         /*
2232          * Enqueue the vnode and wake up either the special thread (empty
2233          * list) or an async thread.
2234          */
2235         if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2236                 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2237                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2238                 signal_inactive_thread = B_TRUE;
2239         } else {
2240                 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2241                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2242         }
2243         if (signal_inactive_thread) {
2244                 cv_signal(&mi->mi_inact_req_cv);
2245         } else  {
2246                 mi->mi_async_req_count++;
2247                 ASSERT(mi->mi_async_req_count != 0);
2248                 cv_signal(&mi->mi_async_reqs_cv);
2249         }
2250 
2251         mutex_exit(&mi->mi_async_lock);
2252 }
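
/*
 * Editor's sketch (not part of the original source): the enqueue and
 * wakeup choice made above.  The request is appended to the
 * NFS4_INACTIVE list; the inactive thread is signalled if the list was
 * empty (or the async manager is gone), otherwise the request is
 * counted and the manager is signalled.  Names prefixed with example_
 * are hypothetical; locking is elided.
 */
typedef struct example_ireq {
        struct example_ireq *i_next;
} example_ireq_t;

static void
example_queue_inactive(example_ireq_t **headp, example_ireq_t **tailp,
    example_ireq_t *req, int manager_gone,
    void (*wake_inactive)(void), void (*wake_manager)(void))
{
        int signal_inactive = manager_gone;

        req->i_next = NULL;
        if (*headp == NULL) {
                *headp = req;
                *tailp = req;
                signal_inactive = 1;    /* list was empty */
        } else {
                (*tailp)->i_next = req;
                *tailp = req;
        }

        if (signal_inactive)
                wake_inactive();        /* cv_signal(&mi_inact_req_cv) */
        else
                wake_manager();         /* cv_signal(&mi_async_reqs_cv) */
}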
2253 
2254 int
2255 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2256 {
2257         int pagecreate;
2258         int n;
2259         int saved_n;
2260         caddr_t saved_base;
2261         u_offset_t offset;
2262         int error;
2263         int sm_error;
2264         vnode_t *vp = RTOV(rp);
2265 
2266         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2267         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2268         if (!vpm_enable) {
2269                 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2270         }
2271 
2272         /*
2273          * Move bytes in at most PAGESIZE chunks. We must avoid
2274          * spanning pages in uiomove() because page faults may cause
2275          * the cache to be invalidated out from under us. The r_size is not
2276          * updated until after the uiomove. If we push the last page of a
2277          * file before r_size is correct, we will lose the data written past
2278          * the current (and invalid) r_size.
2279          */
2280         do {
2281                 offset = uio->uio_loffset;
2282                 pagecreate = 0;
2283 
2284                 /*
2285                  * n is the number of bytes required to satisfy the request
2286                  *   or the number of bytes to fill out the page.
2287                  */
2288                 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2289 
2290                 /*
2291                  * Check to see if we can skip reading in the page
2292                  * and just allocate the memory.  We can do this
2293                  * if we are going to rewrite the entire mapping
2294                  * or if we are going to write to or beyond the current
2295                  * end of file from the beginning of the mapping.
2296                  *
2297                  * The read of r_size is now protected by r_statelock.
2298                  */
2299                 mutex_enter(&rp->r_statelock);
2300                 /*
2301                  * When pgcreated is nonzero the caller has already done
2302                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2303                  * segkpm this means we already have at least one page
2304                  * created and mapped at base.
2305                  */
2306                 pagecreate = pgcreated ||
2307                     ((offset & PAGEOFFSET) == 0 &&
2308                     (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2309 
2310                 mutex_exit(&rp->r_statelock);
2311 
2312                 if (!vpm_enable && pagecreate) {
2313                         /*
2314                          * The last argument tells segmap_pagecreate() to
2315                          * always lock the page, as opposed to sometimes
2316                          * returning with the page locked. This way we avoid a
2317                          * fault on the ensuing uiomove(), but also
2318                          * more importantly (to fix bug 1094402) we can
2319                          * call segmap_fault() to unlock the page in all
2320                          * cases. An alternative would be to modify
2321                          * segmap_pagecreate() to tell us when it is
2322                          * locking a page, but that's a fairly major
2323                          * interface change.
2324                          */
2325                         if (pgcreated == 0)
2326                                 (void) segmap_pagecreate(segkmap, base,
2327                                     (uint_t)n, 1);
2328                         saved_base = base;
2329                         saved_n = n;
2330                 }
2331 
2332                 /*
2333                  * The number of bytes of data in the last page cannot be
2334                  * accurately determined while the page is being uiomove'd
2335                  * to and the size of the file is being updated.  Thus,
2336                  * inform threads which need to know accurately how much
2337                  * data is in the last page of the file.  They will not
2338                  * do the i/o immediately, but will arrange for the i/o
2339                  * to happen later when this modify operation will have
2340                  * finished.
2341                  */
2342                 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2343                 mutex_enter(&rp->r_statelock);
2344                 rp->r_flags |= R4MODINPROGRESS;
2345                 rp->r_modaddr = (offset & MAXBMASK);
2346                 mutex_exit(&rp->r_statelock);
2347 
2348                 if (vpm_enable) {
2349                         /*
2350                          * Copy data. If new pages are created, part of
2351                          * the page that is not written will be initialized
2352                          * with zeros.
2353                          */
2354                         error = vpm_data_copy(vp, offset, n, uio,
2355                             !pagecreate, NULL, 0, S_WRITE);
2356                 } else {
2357                         error = uiomove(base, n, UIO_WRITE, uio);
2358                 }
2359 
2360                 /*
2361                  * r_size is the maximum number of
2362                  * bytes known to be in the file.
2363                  * Make sure it is at least as high as the
2364                  * first unwritten byte pointed to by uio_loffset.
2365                  */
2366                 mutex_enter(&rp->r_statelock);
2367                 if (rp->r_size < uio->uio_loffset)
2368                         rp->r_size = uio->uio_loffset;
2369                 rp->r_flags &= ~R4MODINPROGRESS;
2370                 rp->r_flags |= R4DIRTY;
2371                 mutex_exit(&rp->r_statelock);
2372 
2373                 /* n = # of bytes written */
2374                 n = (int)(uio->uio_loffset - offset);
2375 
2376                 if (!vpm_enable) {
2377                         base += n;
2378                 }
2379 
2380                 tcount -= n;
2381                 /*
2382                  * If we created pages w/o initializing them completely,
2383                  * we need to zero the part that wasn't set up.
2384                  * This happens in most EOF write cases and if
2385                  * we had some sort of error during the uiomove.
2386                  */
2387                 if (!vpm_enable && pagecreate) {
2388                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2389                                 (void) kzero(base, PAGESIZE - n);
2390 
2391                         if (pgcreated) {
2392                                 /*
2393                                  * Caller is responsible for this page,
2394                                  * it was not created in this loop.
2395                                  */
2396                                 pgcreated = 0;
2397                         } else {
2398                                 /*
2399                                  * For bug 1094402: segmap_pagecreate locks
2400                                  * page. Unlock it. This also unlocks the
2401                                  * pages allocated by page_create_va() in
2402                                  * segmap_pagecreate().
2403                                  */
2404                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2405                                     saved_base, saved_n,
2406                                     F_SOFTUNLOCK, S_WRITE);
2407                                 if (error == 0)
2408                                         error = sm_error;
2409                         }
2410                 }
2411         } while (tcount > 0 && error == 0);
2412 
2413         return (error);
2414 }
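
/*
 * Editor's sketch (not part of the original source): the chunk size
 * used by each pass of the loop above.  A single uiomove()/vpm copy is
 * never allowed to span a page boundary, so the chunk is either the
 * remaining byte count or the distance to the end of the current page,
 * whichever is smaller.  The function name is hypothetical.
 */
static size_t
example_write_chunk(u_offset_t offset, size_t tcount)
{
        size_t to_page_end = PAGESIZE - (size_t)(offset & PAGEOFFSET);

        return (tcount < to_page_end ? tcount : to_page_end);
}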
2415 
2416 int
2417 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2418 {
2419         rnode4_t *rp;
2420         page_t *pp;
2421         u_offset_t eoff;
2422         u_offset_t io_off;
2423         size_t io_len;
2424         int error;
2425         int rdirty = 0;
2426         int err;
2427 
2428         rp = VTOR4(vp);
2429         ASSERT(rp->r_count > 0);
2430 
2431         if (!nfs4_has_pages(vp))
2432                 return (0);
2433 
2434         ASSERT(vp->v_type != VCHR);
2435 
2436         /*
2437          * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2438          * writes.  B_FORCE is set to force the VM system to actually
2439          * invalidate the pages, even if the i/o failed.  The pages
2440          * need to get invalidated because they can't be written out
2441          * because there isn't any space left on either the server's
2442          * file system or in the user's disk quota.  The B_FREE bit
2443          * is cleared to avoid confusion as to whether this is a
2444          * request to place the page on the freelist or to destroy
2445          * it.
2446          */
2447         if ((rp->r_flags & R4OUTOFSPACE) ||
2448             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2449                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2450 
2451         if (len == 0) {
2452                 /*
2453                  * If doing a full file synchronous operation, then clear
2454                  * the R4DIRTY bit.  If a page gets dirtied while the flush
2455                  * is happening, then R4DIRTY will get set again.  The
2456                  * R4DIRTY bit must get cleared before the flush so that
2457                  * we don't lose this information.
2458                  *
2459                  * If there are no full file async write operations
2460                  * pending and the R4DIRTY bit is set, clear it.
2461                  */
2462                 if (off == (u_offset_t)0 &&
2463                     !(flags & B_ASYNC) &&
2464                     (rp->r_flags & R4DIRTY)) {
2465                         mutex_enter(&rp->r_statelock);
2466                         rdirty = (rp->r_flags & R4DIRTY);
2467                         rp->r_flags &= ~R4DIRTY;
2468                         mutex_exit(&rp->r_statelock);
2469                 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2470                         mutex_enter(&rp->r_statelock);
2471                         if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2472                                 rdirty = (rp->r_flags & R4DIRTY);
2473                                 rp->r_flags &= ~R4DIRTY;
2474                         }
2475                         mutex_exit(&rp->r_statelock);
2476                 } else
2477                         rdirty = 0;
2478 
2479                 /*
2480                  * Search the entire vp list for pages >= off, and flush
2481                  * the dirty pages.
2482                  */
2483                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2484                     flags, cr);
2485 
2486                 /*
2487                  * If an error occurred and the file was marked as dirty
2488                  * before and we aren't forcibly invalidating pages, then
2489                  * reset the R4DIRTY flag.
2490                  */
2491                 if (error && rdirty &&
2492                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2493                         mutex_enter(&rp->r_statelock);
2494                         rp->r_flags |= R4DIRTY;
2495                         mutex_exit(&rp->r_statelock);
2496                 }
2497         } else {
2498                 /*
2499                  * Do a range from [off...off + len) looking for pages
2500                  * to deal with.
2501                  */
2502                 error = 0;
2503                 io_len = 0;
2504                 eoff = off + len;
2505                 mutex_enter(&rp->r_statelock);
2506                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2507                     io_off += io_len) {
2508                         mutex_exit(&rp->r_statelock);
2509                         /*
2510                          * If we are not invalidating, synchronously
2511                          * freeing, or writing pages, use the routine
2512                          * page_lookup_nowait() to prevent reclaiming
2513                          * them from the free list.
2514                          */
2515                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2516                                 pp = page_lookup(vp, io_off,
2517                                     (flags & (B_INVAL | B_FREE)) ?
2518                                     SE_EXCL : SE_SHARED);
2519                         } else {
2520                                 pp = page_lookup_nowait(vp, io_off,
2521                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2522                         }
2523 
2524                         if (pp == NULL || !pvn_getdirty(pp, flags))
2525                                 io_len = PAGESIZE;
2526                         else {
2527                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2528                                     &io_len, flags, cr);
2529                                 if (!error)
2530                                         error = err;
2531                                 /*
2532                                  * "io_off" and "io_len" are returned as
2533                                  * the range of pages we actually wrote.
2534                                  * This allows us to skip ahead more quickly
2535                                  * since several pages may have been dealt
2536                                  * with by this iteration of the loop.
2537                                  */
2538                         }
2539                         mutex_enter(&rp->r_statelock);
2540                 }
2541                 mutex_exit(&rp->r_statelock);
2542         }
2543 
2544         return (error);
2545 }
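
/*
 * Editor's sketch (not part of the original source): the shape of the
 * ranged walk above.  The loop covers [off, off + len), bounded by the
 * file size; when no dirty page is found at io_off it advances by
 * PAGESIZE, otherwise the putapage routine reports how much it actually
 * wrote back through io_len so whole clusters can be skipped at once.
 * The names example_range_walk and flush_at are hypothetical.
 */
static void
example_range_walk(u_offset_t off, size_t len, u_offset_t fsize,
    size_t (*flush_at)(u_offset_t))
{
        u_offset_t io_off;
        u_offset_t eoff = off + len;
        size_t io_len;

        for (io_off = off; io_off < eoff && io_off < fsize;
            io_off += io_len) {
                io_len = flush_at(io_off);      /* bytes written, or 0 */
                if (io_len == 0)
                        io_len = PAGESIZE;      /* nothing dirty here */
        }
}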
2546 
2547 void
2548 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2549 {
2550         rnode4_t *rp;
2551 
2552         rp = VTOR4(vp);
2553         if (IS_SHADOW(vp, rp))
2554                 vp = RTOV4(rp);
2555         mutex_enter(&rp->r_statelock);
2556         while (rp->r_flags & R4TRUNCATE)
2557                 cv_wait(&rp->r_cv, &rp->r_statelock);
2558         rp->r_flags |= R4TRUNCATE;
2559         if (off == (u_offset_t)0) {
2560                 rp->r_flags &= ~R4DIRTY;
2561                 if (!(rp->r_flags & R4STALE))
2562                         rp->r_error = 0;
2563         }
2564         rp->r_truncaddr = off;
2565         mutex_exit(&rp->r_statelock);
2566         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2567             B_INVAL | B_TRUNC, cr);
2568         mutex_enter(&rp->r_statelock);
2569         rp->r_flags &= ~R4TRUNCATE;
2570         cv_broadcast(&rp->r_cv);
2571         mutex_exit(&rp->r_statelock);
2572 }
2573 
2574 static int
2575 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2576 {
2577         mntinfo4_t *mi;
2578         struct mntinfo_kstat *mik;
2579         vfs_t *vfsp;
2580 
2581         /* this is a read-only kstat. Bail out on a write */
2582         if (rw == KSTAT_WRITE)
2583                 return (EACCES);
2584 
2585 
2586         /*
2587          * We don't want to wait here as kstat_chain_lock could be held by
2588          * dounmount(). dounmount() takes vfs_reflock before the chain lock
2589          * and thus could lead to a deadlock.
2590          */
2591         vfsp = (struct vfs *)ksp->ks_private;
2592 
2593         mi = VFTOMI4(vfsp);
2594         mik = (struct mntinfo_kstat *)ksp->ks_data;
2595 
2596         (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2597 
2598         mik->mik_vers = (uint32_t)mi->mi_vers;
2599         mik->mik_flags = mi->mi_flags;
2600         /*
2601          * The sv_secdata holds the flavor the client specifies.
2602          * If the client uses default and a security negotiation
2603          * occurs, sv_currsec will point to the current flavor
2604          * selected from the server flavor list.
2605          * sv_currsec is NULL if no security negotiation takes place.
2606          */
2607         mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2608             mi->mi_curr_serv->sv_currsec->secmod :
2609             mi->mi_curr_serv->sv_secdata->secmod;
2610         mik->mik_curread = (uint32_t)mi->mi_curread;
2611         mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2612         mik->mik_retrans = mi->mi_retrans;
2613         mik->mik_timeo = mi->mi_timeo;
2614         mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2615         mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2616         mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2617         mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2618         mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2619         mik->mik_failover = (uint32_t)mi->mi_failover;
2620         mik->mik_remap = (uint32_t)mi->mi_remap;
2621 
2622         (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2623 
2624         return (0);
2625 }
2626 
2627 void
2628 nfs4_mnt_kstat_init(struct vfs *vfsp)
2629 {
2630         mntinfo4_t *mi = VFTOMI4(vfsp);
2631 
2632         /*
2633          * PSARC 2001/697 Contract Private Interface
2634          * All nfs kstats are under SunMC contract
2635          * Please refer to the PSARC listed above and contact
2636          * SunMC before making any changes!
2637          *
2638          * Changes must be reviewed by Solaris File Sharing
2639          * Changes must be communicated to contract-2001-697@sun.com
2640          *
2641          */
2642 
2643         mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2644             NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2645         if (mi->mi_io_kstats) {
2646                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2647                         kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2648                 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2649                 kstat_install(mi->mi_io_kstats);
2650         }
2651 
2652         if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2653             getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2654             sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2655                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2656                         kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2657                 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2658                 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2659                 kstat_install(mi->mi_ro_kstats);
2660         }
2661 
2662         nfs4_mnt_recov_kstat_init(vfsp);
2663 }
2664 
2665 void
2666 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2667 {
2668         mntinfo4_t *mi;
2669         clock_t now = ddi_get_lbolt();
2670 
2671         mi = VTOMI4(vp);
2672         /*
2673          * In case of forced unmount, do not print any messages
2674          * since it can flood the console with error messages.
2675          */
2676         if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2677                 return;
2678 
2679         /*
2680          * If the mount point is dead (not recoverable), do not
2681          * print error messages that can flood the console.
2682          */
2683         if (mi->mi_flags & MI4_RECOV_FAIL)
2684                 return;
2685 
2686         /*
2687          * No use in flooding the console with ENOSPC
2688          * messages from the same file system.
2689          */
2690         if ((error != ENOSPC && error != EDQUOT) ||
2691             now - mi->mi_printftime > 0) {
2692                 zoneid_t zoneid = mi->mi_zone->zone_id;
2693 
2694 #ifdef DEBUG
2695                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2696                     mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2697 #else
2698                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2699                     VTOR4(vp)->r_server->sv_hostname, NULL);
2700 #endif
2701                 if (error == ENOSPC || error == EDQUOT) {
2702                         zcmn_err(zoneid, CE_CONT,
2703                             "^File: userid=%d, groupid=%d\n",
2704                             crgetuid(cr), crgetgid(cr));
2705                         if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2706                             crgetgid(curthread->t_cred) != crgetgid(cr)) {
2707                                 zcmn_err(zoneid, CE_CONT,
2708                                     "^User: userid=%d, groupid=%d\n",
2709                                     crgetuid(curthread->t_cred),
2710                                     crgetgid(curthread->t_cred));
2711                         }
2712                         mi->mi_printftime = now +
2713                             nfs_write_error_interval * hz;
2714                 }
2715                 sfh4_printfhandle(VTOR4(vp)->r_fh);
2716 #ifdef DEBUG
2717                 if (error == EACCES) {
2718                         zcmn_err(zoneid, CE_CONT,
2719                             "nfs_bio: cred is%s kcred\n",
2720                             cr == kcred ? "" : " not");
2721                 }
2722 #endif
2723         }
2724 }
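
/*
 * Editor's sketch (not part of the original source): the ENOSPC/EDQUOT
 * throttle used above.  Other errors are always logged; out-of-space
 * errors are logged at most once per interval, by stamping the next
 * time (in lbolt-style ticks) at which a message may be printed again.
 * The names example_should_log and next_print are hypothetical.
 */
static int
example_should_log(int error, clock_t now, clock_t *next_print,
    clock_t interval)
{
        if (error != ENOSPC && error != EDQUOT)
                return (1);
        if (now - *next_print > 0) {
                *next_print = now + interval;
                return (1);
        }
        return (0);
}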
2725 
2726 /*
2727  * Return non-zero if the given file can be safely memory mapped.  Locks
2728  * are safe if whole-file (length and offset are both zero).
2729  */
2730 
2731 #define SAFE_LOCK(flk)  ((flk).l_start == 0 && (flk).l_len == 0)
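
/*
 * Editor's note (not part of the original source): SAFE_LOCK() treats
 * only a whole-file lock, i.e. l_start == 0 and l_len == 0, as safe for
 * memory mapping; any byte-range lock is unsafe.  A minimal usage
 * sketch with a hypothetical helper name:
 */
static int
example_is_mmap_safe(const flock64_t *flk)
{
        return (SAFE_LOCK(*flk));       /* 1 for a (0, 0) whole-file lock */
}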
2732 
2733 static int
2734 nfs4_safemap(const vnode_t *vp)
2735 {
2736         locklist_t      *llp, *next_llp;
2737         int             safe = 1;
2738         rnode4_t        *rp = VTOR4(vp);
2739 
2740         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2741 
2742         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2743             "vp = %p", (void *)vp));
2744 
2745         /*
2746          * Review all the locks for the vnode, both ones that have been
2747          * acquired and ones that are pending.  We assume that
2748          * flk_active_locks_for_vp() has merged any locks that can be
2749          * merged (so that if a process has the entire file locked, it is
2750          * represented as a single lock).
2751          *
2752          * Note that we can't bail out of the loop if we find a non-safe
2753          * lock, because we have to free all the elements in the llp list.
2754          * We might be able to speed up this code slightly by not looking
2755          * at each lock's l_start and l_len fields once we've found a
2756          * non-safe lock.
2757          */
2758 
2759         llp = flk_active_locks_for_vp(vp);
2760         while (llp) {
2761                 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2762                     "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2763                     llp->ll_flock.l_start, llp->ll_flock.l_len));
2764                 if (!SAFE_LOCK(llp->ll_flock)) {
2765                         safe = 0;
2766                         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2767                             "nfs4_safemap: unsafe active lock (%" PRId64
2768                             ", %" PRId64 ")", llp->ll_flock.l_start,
2769                             llp->ll_flock.l_len));
2770                 }
2771                 next_llp = llp->ll_next;
2772                 VN_RELE(llp->ll_vp);
2773                 kmem_free(llp, sizeof (*llp));
2774                 llp = next_llp;
2775         }
2776 
2777         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2778             safe ? "safe" : "unsafe"));
2779         return (safe);
2780 }
2781 
2782 /*
2783  * Return whether there is a lost LOCK or LOCKU queued up for the given
2784  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2785  */
2786 
2787 bool_t
2788 nfs4_map_lost_lock_conflict(vnode_t *vp)
2789 {
2790         bool_t conflict = FALSE;
2791         nfs4_lost_rqst_t *lrp;
2792         mntinfo4_t *mi = VTOMI4(vp);
2793 
2794         mutex_enter(&mi->mi_lock);
2795         for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2796             lrp = list_next(&mi->mi_lost_state, lrp)) {
2797                 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2798                         continue;
2799                 ASSERT(lrp->lr_vp != NULL);
2800                 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2801                         continue;       /* different file */
2802                 if (!SAFE_LOCK(*lrp->lr_flk)) {
2803                         conflict = TRUE;
2804                         break;
2805                 }
2806         }
2807 
2808         mutex_exit(&mi->mi_lock);
2809         return (conflict);
2810 }
2811 
2812 /*
2813  * nfs_lockcompletion:
2814  *
2815  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2816  * as non-cacheable (set the VNOCACHE bit).
2817  */
2818 
2819 void
2820 nfs4_lockcompletion(vnode_t *vp, int cmd)
2821 {
2822         rnode4_t *rp = VTOR4(vp);
2823 
2824         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2825         ASSERT(!IS_SHADOW(vp, rp));
2826 
2827         if (cmd == F_SETLK || cmd == F_SETLKW) {
2828 
2829                 if (!nfs4_safemap(vp)) {
2830                         mutex_enter(&vp->v_lock);
2831                         vp->v_flag |= VNOCACHE;
2832                         mutex_exit(&vp->v_lock);
2833                 } else {
2834                         mutex_enter(&vp->v_lock);
2835                         vp->v_flag &= ~VNOCACHE;
2836                         mutex_exit(&vp->v_lock);
2837                 }
2838         }
2839         /*
2840          * The cached attributes of the file are stale after acquiring
2841          * the lock: they were last updated when the file was opened,
2842          * not when the lock was acquired.  Therefore the cached
2843          * attributes are invalidated now that the lock is held.
2844          */
2845         PURGE_ATTRCACHE4(vp);
2846 }
2847 
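     /*
      * Zone-key constructor: allocate and initialize the per-zone list of
      * NFSv4 mounts.  Registered together with nfs4_mi_shutdown() and
      * nfs4_mi_destroy() via zone_key_create() in nfs4_clnt_init() below.
      */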
2848 /* ARGSUSED */
2849 static void *
2850 nfs4_mi_init(zoneid_t zoneid)
2851 {
2852         struct mi4_globals *mig;
2853 
2854         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2855         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2856         list_create(&mig->mig_list, sizeof (mntinfo4_t),
2857             offsetof(mntinfo4_t, mi_zone_node));
2858         mig->mig_destructor_called = B_FALSE;
2859         return (mig);
2860 }
2861 
2862 /*
2863  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2864  * state and killing off threads.
2865  */
2866 /* ARGSUSED */
2867 static void
2868 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2869 {
2870         struct mi4_globals *mig = data;
2871         mntinfo4_t *mi;
2872         nfs4_server_t *np;
2873 
2874         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2875             "nfs4_mi_shutdown zone %d\n", zoneid));
2876         ASSERT(mig != NULL);
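             /*
              * Walk the zone's list of mounts, shutting down and releasing
              * each mntinfo4 in turn until the list is empty.
              */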
2877         for (;;) {
2878                 mutex_enter(&mig->mig_lock);
2879                 mi = list_head(&mig->mig_list);
2880                 if (mi == NULL) {
2881                         mutex_exit(&mig->mig_lock);
2882                         break;
2883                 }
2884 
2885                 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886                     "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2887                 /*
2888                  * purge the DNLC for this filesystem
2889                  */
2890                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2891                 /*
2892                  * Tell existing async worker threads to exit.
2893                  */
2894                 mutex_enter(&mi->mi_async_lock);
2895                 mi->mi_max_threads = 0;
2896                 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2897                 /*
2898                  * Set the appropriate flags, signal and wait for both the
2899                  * async manager and the inactive thread to exit when they're
2900                  * done with their current work.
2901                  */
2902                 mutex_enter(&mi->mi_lock);
2903                 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2904                 mutex_exit(&mi->mi_lock);
2905                 mutex_exit(&mi->mi_async_lock);
2906                 if (mi->mi_manager_thread) {
2907                         nfs4_async_manager_stop(mi->mi_vfsp);
2908                 }
2909                 if (mi->mi_inactive_thread) {
2910                         mutex_enter(&mi->mi_async_lock);
2911                         cv_signal(&mi->mi_inact_req_cv);
2912                         /*
2913                          * Wait for the inactive thread to exit.
2914                          */
2915                         while (mi->mi_inactive_thread != NULL) {
2916                                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2917                         }
2918                         mutex_exit(&mi->mi_async_lock);
2919                 }
2920                 /*
2921                  * Wait for the recovery thread to complete; it will
2922                  * signal when it is done using the "mi" structure and
2923                  * is about to exit.
2924                  */
2925                 mutex_enter(&mi->mi_lock);
2926                 while (mi->mi_in_recovery > 0)
2927                         cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2928                 mutex_exit(&mi->mi_lock);
2929                 /*
2930                  * We're done with this mi; remove it from the list.
2931                  * The loop ends once the list is empty.
2932                  */
2933                 list_remove(&mig->mig_list, mi);
2934                 mutex_exit(&mig->mig_lock);
2935                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2936 
2937                 /*
2938                  * Release the holds on the vfs and mi that were taken (in
2939                  * nfs4_mi_zonelist_add) to prevent a race with zone shutdown.
2940                  */
2941                 VFS_RELE(mi->mi_vfsp);
2942                 MI4_RELE(mi);
2943         }
2944         /*
2945          * Tell each renew thread in the zone to exit
2946          */
2947         mutex_enter(&nfs4_server_lst_lock);
2948         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2949                 mutex_enter(&np->s_lock);
2950                 if (np->zoneid == zoneid) {
2951                         /*
2952                          * We add another hold onto the nfs4_server_t
2953                          * because this will make sure that the nfs4_server_t
2954                          * stays around until nfs4_callback_fini_zone destroys
2955                          * the zone. This way, the renew thread can
2956                          * unconditionally release its holds on the
2957                          * nfs4_server_t.
2958                          */
2959                         np->s_refcnt++;
2960                         nfs4_mark_srv_dead(np);
2961                 }
2962                 mutex_exit(&np->s_lock);
2963         }
2964         mutex_exit(&nfs4_server_lst_lock);
2965 }
2966 
2967 static void
2968 nfs4_mi_free_globals(struct mi4_globals *mig)
2969 {
2970         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2971         mutex_destroy(&mig->mig_lock);
2972         kmem_free(mig, sizeof (*mig));
2973 }
2974 
2975 /* ARGSUSED */
2976 static void
2977 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2978 {
2979         struct mi4_globals *mig = data;
2980 
2981         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2982             "nfs4_mi_destroy zone %d\n", zoneid));
2983         ASSERT(mig != NULL);
2984         mutex_enter(&mig->mig_lock);
2985         if (list_head(&mig->mig_list) != NULL) {
2986                 /* Still waiting for VFS_FREEVFS() */
2987                 mig->mig_destructor_called = B_TRUE;
2988                 mutex_exit(&mig->mig_lock);
2989                 return;
2990         }
2991         nfs4_mi_free_globals(mig);
2992 }
2993 
2994 /*
2995  * Add an NFS mount to the per-zone list of NFS mounts.
2996  */
2997 void
2998 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2999 {
3000         struct mi4_globals *mig;
3001 
3002         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3003         mutex_enter(&mig->mig_lock);
3004         list_insert_head(&mig->mig_list, mi);
3005         /*
3006          * A hold is added to eliminate a race with zone shutdown; it is
3007          * released in nfs4_mi_shutdown or nfs4_mi_zonelist_remove.
3008          */
3009         MI4_HOLD(mi);
3010         VFS_HOLD(mi->mi_vfsp);
3011         mutex_exit(&mig->mig_lock);
3012 }
3013 
3014 /*
3015  * Remove an NFS mount from the per-zone list of NFS mounts.
3016  */
3017 int
3018 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3019 {
3020         struct mi4_globals *mig;
3021         int ret = 0;
3022 
3023         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3024         mutex_enter(&mig->mig_lock);
3025         mutex_enter(&mi->mi_lock);
3026         /* if this mi is marked dead, then the zone already released it */
3027         if (!(mi->mi_flags & MI4_DEAD)) {
3028                 list_remove(&mig->mig_list, mi);
3029                 mutex_exit(&mi->mi_lock);
3030 
3031                 /* release the holds put on in zonelist_add(). */
3032                 VFS_RELE(mi->mi_vfsp);
3033                 MI4_RELE(mi);
3034                 ret = 1;
3035         } else {
3036                 mutex_exit(&mi->mi_lock);
3037         }
3038 
3039         /*
3040          * We can be called asynchronously by VFS_FREEVFS() after the zone
3041          * shutdown/destroy callbacks have executed; if so, clean up the zone's
3042          * mi globals.
3043          */
3044         if (list_head(&mig->mig_list) == NULL &&
3045             mig->mig_destructor_called == B_TRUE) {
3046                 nfs4_mi_free_globals(mig);
3047                 return (ret);
3048         }
3049         mutex_exit(&mig->mig_lock);
3050         return (ret);
3051 }
3052 
3053 void
3054 nfs_free_mi4(mntinfo4_t *mi)
3055 {
3056         nfs4_open_owner_t       *foop;
3057         nfs4_oo_hash_bucket_t   *bucketp;
3058         nfs4_debug_msg_t        *msgp;
3059         int i;
3060         servinfo4_t             *svp;
3061 
3062         /*
3063          * Code introduced here should be carefully evaluated to make
3064          * sure none of the freed resources are accessed, either directly
3065          * or indirectly, after they are freed.  For example: calls to
3066          * NFS4_DEBUG that use mntinfo4_t members after those members
3067          * have been freed, or routines that call back into NFS and touch
3068          * a freed mntinfo4_t member.
3069          */
3070         mutex_enter(&mi->mi_lock);
3071         ASSERT(mi->mi_recovthread == NULL);
3072         ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3073         mutex_exit(&mi->mi_lock);
3074         mutex_enter(&mi->mi_async_lock);
3075         ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3076             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3077         ASSERT(mi->mi_manager_thread == NULL);
3078         mutex_exit(&mi->mi_async_lock);
3079         if (mi->mi_io_kstats) {
3080                 kstat_delete(mi->mi_io_kstats);
3081                 mi->mi_io_kstats = NULL;
3082         }
3083         if (mi->mi_ro_kstats) {
3084                 kstat_delete(mi->mi_ro_kstats);
3085                 mi->mi_ro_kstats = NULL;
3086         }
3087         if (mi->mi_recov_ksp) {
3088                 kstat_delete(mi->mi_recov_ksp);
3089                 mi->mi_recov_ksp = NULL;
3090         }
3091         mutex_enter(&mi->mi_msg_list_lock);
3092         while (msgp = list_head(&mi->mi_msg_list)) {
3093                 list_remove(&mi->mi_msg_list, msgp);
3094                 nfs4_free_msg(msgp);
3095         }
3096         mutex_exit(&mi->mi_msg_list_lock);
3097         list_destroy(&mi->mi_msg_list);
3098         if (mi->mi_fname != NULL)
3099                 fn_rele(&mi->mi_fname);
3100         if (mi->mi_rootfh != NULL)
3101                 sfh4_rele(&mi->mi_rootfh);
3102         if (mi->mi_srvparentfh != NULL)
3103                 sfh4_rele(&mi->mi_srvparentfh);
3104         svp = mi->mi_servers;
3105         sv4_free(svp);
3106         mutex_destroy(&mi->mi_lock);
3107         mutex_destroy(&mi->mi_async_lock);
3108         mutex_destroy(&mi->mi_msg_list_lock);
3109         nfs_rw_destroy(&mi->mi_recovlock);
3110         nfs_rw_destroy(&mi->mi_rename_lock);
3111         nfs_rw_destroy(&mi->mi_fh_lock);
3112         cv_destroy(&mi->mi_failover_cv);
3113         cv_destroy(&mi->mi_async_reqs_cv);
3114         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3115         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3116         cv_destroy(&mi->mi_async_cv);
3117         cv_destroy(&mi->mi_inact_req_cv);
3118         /*
3119          * Destroy the oo hash lists and mutexes for the cred hash table.
3120          */
3121         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3122                 bucketp = &(mi->mi_oo_list[i]);
3123                 /* Destroy any remaining open owners on the list */
3124                 foop = list_head(&bucketp->b_oo_hash_list);
3125                 while (foop != NULL) {
3126                         list_remove(&bucketp->b_oo_hash_list, foop);
3127                         nfs4_destroy_open_owner(foop);
3128                         foop = list_head(&bucketp->b_oo_hash_list);
3129                 }
3130                 list_destroy(&bucketp->b_oo_hash_list);
3131                 mutex_destroy(&bucketp->b_lock);
3132         }
3133         /*
3134          * Empty and destroy the freed open owner list.
3135          */
3136         foop = list_head(&mi->mi_foo_list);
3137         while (foop != NULL) {
3138                 list_remove(&mi->mi_foo_list, foop);
3139                 nfs4_destroy_open_owner(foop);
3140                 foop = list_head(&mi->mi_foo_list);
3141         }
3142         list_destroy(&mi->mi_foo_list);
3143         list_destroy(&mi->mi_bseqid_list);
3144         list_destroy(&mi->mi_lost_state);
3145         avl_destroy(&mi->mi_filehandles);
3146         kmem_free(mi, sizeof (*mi));
3147 }
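
     /*
      * Reference counting for mntinfo4_t: mi_hold() bumps mi_count and
      * mi_rele() drops it; when the count reaches zero, nfs_free_mi4()
      * above tears the structure down.
      */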
3148 void
3149 mi_hold(mntinfo4_t *mi)
3150 {
3151         atomic_add_32(&mi->mi_count, 1);
3152         ASSERT(mi->mi_count != 0);
3153 }
3154 
3155 void
3156 mi_rele(mntinfo4_t *mi)
3157 {
3158         ASSERT(mi->mi_count != 0);
3159         if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3160                 nfs_free_mi4(mi);
3161         }
3162 }
3163 
3164 vnode_t    nfs4_xattr_notsupp_vnode;
3165 
3166 void
3167 nfs4_clnt_init(void)
3168 {
3169         nfs4_vnops_init();
3170         (void) nfs4_rnode_init();
3171         (void) nfs4_shadow_init();
3172         (void) nfs4_acache_init();
3173         (void) nfs4_subr_init();
3174         nfs4_acl_init();
3175         nfs_idmap_init();
3176         nfs4_callback_init();
3177         nfs4_secinfo_init();
3178 #ifdef  DEBUG
3179         tsd_create(&nfs4_tsd_key, NULL);
3180 #endif
3181 
3182         /*
3183          * Add a CPR callback so that we can update the client
3184          * lease after a suspend and resume.
3185          */
3186         cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3187 
3188         zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3189             nfs4_mi_destroy);
3190 
3191         /*
3192          * Initialise the reference count of the notsupp xattr cache vnode to 1
3193          * so that it never goes away (VOP_INACTIVE isn't called on it).
3194          */
3195         nfs4_xattr_notsupp_vnode.v_count = 1;
3196 }
3197 
3198 void
3199 nfs4_clnt_fini(void)
3200 {
3201         (void) zone_key_delete(mi4_list_key);
3202         nfs4_vnops_fini();
3203         (void) nfs4_rnode_fini();
3204         (void) nfs4_shadow_fini();
3205         (void) nfs4_acache_fini();
3206         (void) nfs4_subr_fini();
3207         nfs_idmap_fini();
3208         nfs4_callback_fini();
3209         nfs4_secinfo_fini();
3210 #ifdef  DEBUG
3211         tsd_destroy(&nfs4_tsd_key);
3212 #endif
3213         if (cid)
3214                 (void) callb_delete(cid);
3215 }
3216 
3217 /*ARGSUSED*/
3218 static boolean_t
3219 nfs4_client_cpr_callb(void *arg, int code)
3220 {
3221         /*
3222          * We get called for Suspend and Resume events.
3223          * For the suspend case we simply don't care!
3224          */
3225         if (code == CB_CODE_CPR_CHKPT) {
3226                 return (B_TRUE);
3227         }
3228 
3229         /*
3230          * When we get to here we are in the process of
3231          * resuming the system from a previous suspend.
3232          */
3233         nfs4_client_resumed = gethrestime_sec();
3234         return (B_TRUE);
3235 }
3236 
3237 void
3238 nfs4_renew_lease_thread(nfs4_server_t *sp)
3239 {
3240         int     error = 0;
3241         time_t  tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3242         clock_t tick_delay = 0;
3243         clock_t time_left = 0;
3244         callb_cpr_t cpr_info;
3245         kmutex_t cpr_lock;
3246 
3247         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3248             "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3249         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3250         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3251 
3252         mutex_enter(&sp->s_lock);
3253         /* sp->s_lease_time is set via a GETATTR */
3254         sp->last_renewal_time = gethrestime_sec();
3255         sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3256         ASSERT(sp->s_refcnt >= 1);
3257 
3258         for (;;) {
3259                 if (!sp->state_ref_count ||
3260                     sp->lease_valid != NFS4_LEASE_VALID) {
3261 
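                             /*
                              * No state to maintain, or the lease is not
                              * valid: sleep for roughly half the lease time
                              * (less three propagation delays, but at least
                              * one second) and then re-check.
                              */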
3262                         kip_secs = MAX((sp->s_lease_time >> 1) -
3263                             (3 * sp->propagation_delay.tv_sec), 1);
3264 
3265                         tick_delay = SEC_TO_TICK(kip_secs);
3266 
3267                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3268                             "nfs4_renew_lease_thread: no renew : thread "
3269                             "wait %ld secs", kip_secs));
3270 
3271                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3272                             "nfs4_renew_lease_thread: no renew : "
3273                             "state_ref_count %d, lease_valid %d",
3274                             sp->state_ref_count, sp->lease_valid));
3275 
3276                         mutex_enter(&cpr_lock);
3277                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
3278                         mutex_exit(&cpr_lock);
3279                         time_left = cv_reltimedwait(&sp->cv_thread_exit,
3280                             &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3281                         mutex_enter(&cpr_lock);
3282                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3283                         mutex_exit(&cpr_lock);
3284 
3285                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3286                             "nfs4_renew_lease_thread: no renew: "
3287                             "time left %ld", time_left));
3288 
3289                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3290                                 goto die;
3291                         continue;
3292                 }
3293 
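                     /*
                      * The lease is valid and in use: arrange to wake up and
                      * send a RENEW about halfway through the lease period,
                      * adjusted for the time already elapsed since the last
                      * renewal and for the observed propagation delay.
                      */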
3294                 tmp_last_renewal_time = sp->last_renewal_time;
3295 
3296                 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3297                     (3 * sp->propagation_delay.tv_sec);
3298 
3299                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3300                     "nfs4_renew_lease_thread: tmp_time %ld, "
3301                     "sp->last_renewal_time %ld", tmp_time,
3302                     sp->last_renewal_time));
3303 
3304                 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3305 
3306                 tick_delay = SEC_TO_TICK(kip_secs);
3307 
3308                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3309                     "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3310                     "secs", kip_secs));
3311 
3312                 mutex_enter(&cpr_lock);
3313                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3314                 mutex_exit(&cpr_lock);
3315                 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3316                     tick_delay, TR_CLOCK_TICK);
3317                 mutex_enter(&cpr_lock);
3318                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3319                 mutex_exit(&cpr_lock);
3320 
3321                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3322                     "nfs4_renew_lease_thread: valid lease: time left %ld :"
3323                     "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3324                     "tmp_last_renewal_time %ld", time_left,
3325                     sp->last_renewal_time, nfs4_client_resumed,
3326                     tmp_last_renewal_time));
3327 
3328                 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3329                         goto die;
3330 
3331                 if (tmp_last_renewal_time == sp->last_renewal_time ||
3332                     (nfs4_client_resumed != 0 &&
3333                     nfs4_client_resumed > sp->last_renewal_time)) {
3334                         /*
3335                          * Issue RENEW op since we haven't renewed the lease
3336                          * since we slept.
3337                          */
3338                         tmp_now_time = gethrestime_sec();
3339                         error = nfs4renew(sp);
3340                         /*
3341                          * Need to re-acquire sp's lock; nfs4renew()
3342                          * relinquishes it.
3343                          */
3344                         mutex_enter(&sp->s_lock);
3345 
3346                         /*
3347                          * See if someone changed s_thread_exit while we gave
3348                          * up s_lock.
3349                          */
3350                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3351                                 goto die;
3352 
3353                         if (!error) {
3354                                 /*
3355                                  * Check whether we implicitly renewed while
3356                                  * waiting for the reply to our RENEW call.
3357                                  */
3358                                 if (tmp_last_renewal_time ==
3359                                     sp->last_renewal_time) {
3360                                         /* no implicit renew came */
3361                                         sp->last_renewal_time = tmp_now_time;
3362                                 } else {
3363                                         NFS4_DEBUG(nfs4_client_lease_debug,
3364                                             (CE_NOTE, "renew_thread: did "
3365                                             "implicit renewal before reply "
3366                                             "from server for RENEW"));
3367                                 }
3368                         } else {
3369                                 /* figure out error */
3370                                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3371                                     "renew_thread: nfs4renew returned error"
3372                                     " %d", error));
3373                         }
3374 
3375                 }
3376         }
3377 
3378 die:
3379         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3380             "nfs4_renew_lease_thread: thread exiting"));
3381 
3382         while (sp->s_otw_call_count != 0) {
3383                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3384                     "nfs4_renew_lease_thread: waiting for outstanding "
3385                     "otw calls to finish for sp 0x%p, current "
3386                     "s_otw_call_count %d", (void *)sp,
3387                     sp->s_otw_call_count));
3388                 mutex_enter(&cpr_lock);
3389                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3390                 mutex_exit(&cpr_lock);
3391                 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3392                 mutex_enter(&cpr_lock);
3393                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3394                 mutex_exit(&cpr_lock);
3395         }
3396         mutex_exit(&sp->s_lock);
3397 
3398         nfs4_server_rele(sp);           /* free the thread's reference */
3399         nfs4_server_rele(sp);           /* free the list's reference */
3400         sp = NULL;
3401 
3402 done:
3403         mutex_enter(&cpr_lock);
3404         CALLB_CPR_EXIT(&cpr_info);  /* drops cpr_lock */
3405         mutex_destroy(&cpr_lock);
3406 
3407         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3408             "nfs4_renew_lease_thread: renew thread exit officially"));
3409 
3410         zthread_exit();
3411         /* NOT REACHED */
3412 }
3413 
3414 /*
3415  * Send out a RENEW op to the server.
3416  * Called with sp->s_lock held; the lock is dropped before returning.
3417  */
3418 static int
3419 nfs4renew(nfs4_server_t *sp)
3420 {
3421         COMPOUND4args_clnt args;
3422         COMPOUND4res_clnt res;
3423         nfs_argop4 argop[1];
3424         int doqueue = 1;
3425         int rpc_error;
3426         cred_t *cr;
3427         mntinfo4_t *mi;
3428         timespec_t prop_time, after_time;
3429         int needrecov = FALSE;
3430         nfs4_recov_state_t recov_state;
3431         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 
3433         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3434 
3435         recov_state.rs_flags = 0;
3436         recov_state.rs_num_retry_despite_err = 0;
3437 
3438 recov_retry:
3439         mi = sp->mntinfo4_list;
3440         VFS_HOLD(mi->mi_vfsp);
3441         mutex_exit(&sp->s_lock);
3442         ASSERT(mi != NULL);
3443 
3444         e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3445         if (e.error) {
3446                 VFS_RELE(mi->mi_vfsp);
3447                 return (e.error);
3448         }
3449 
3450         /* Check to see if we're dealing with a marked-dead sp */
3451         mutex_enter(&sp->s_lock);
3452         if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3453                 mutex_exit(&sp->s_lock);
3454                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3455                 VFS_RELE(mi->mi_vfsp);
3456                 return (0);
3457         }
3458 
3459         /* Make sure mi hasn't changed on us */
3460         if (mi != sp->mntinfo4_list) {
3461                 /* Must drop sp's lock to avoid a recursive mutex enter */
3462                 mutex_exit(&sp->s_lock);
3463                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3464                 VFS_RELE(mi->mi_vfsp);
3465                 mutex_enter(&sp->s_lock);
3466                 goto recov_retry;
3467         }
3468         mutex_exit(&sp->s_lock);
3469 
3470         args.ctag = TAG_RENEW;
3471 
3472         args.array_len = 1;
3473         args.array = argop;
3474 
3475         argop[0].argop = OP_RENEW;
3476 
3477         mutex_enter(&sp->s_lock);
3478         argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3479         cr = sp->s_cred;
3480         crhold(cr);
3481         mutex_exit(&sp->s_lock);
3482 
3483         ASSERT(cr != NULL);
3484 
3485         /* used to figure out RTT for sp */
3486         gethrestime(&prop_time);
3487 
3488         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3489             "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3490             (void*)sp));
3491         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3492             prop_time.tv_sec, prop_time.tv_nsec));
3493 
3494         DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3495             mntinfo4_t *, mi);
3496 
3497         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3498         crfree(cr);
3499 
3500         DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3501             mntinfo4_t *, mi);
3502 
3503         gethrestime(&after_time);
3504 
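             /*
              * Record the observed round-trip time of this call (at least
              * one second) as the propagation delay; the renew thread
              * factors it into its sleep calculations.
              */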
3505         mutex_enter(&sp->s_lock);
3506         sp->propagation_delay.tv_sec =
3507             MAX(1, after_time.tv_sec - prop_time.tv_sec);
3508         mutex_exit(&sp->s_lock);
3509 
3510         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3511             after_time.tv_sec, after_time.tv_nsec));
3512 
3513         if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3514                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3515                 nfs4_delegreturn_all(sp);
3516                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3517                 VFS_RELE(mi->mi_vfsp);
3518                 /*
3519                  * If the server returns CB_PATH_DOWN, it has renewed
3520                  * the lease and informed us that the callback path is
3521                  * down.  Since the lease is renewed, just return 0 and
3522                  * let the renew thread proceed as normal.
3523                  */
3524                 return (0);
3525         }
3526 
3527         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3528         if (!needrecov && e.error) {
3529                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3530                 VFS_RELE(mi->mi_vfsp);
3531                 return (e.error);
3532         }
3533 
3534         rpc_error = e.error;
3535 
3536         if (needrecov) {
3537                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3538                     "nfs4renew: initiating recovery\n"));
3539 
3540                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3541                     OP_RENEW, NULL, NULL, NULL) == FALSE) {
3542                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3543                         VFS_RELE(mi->mi_vfsp);
3544                         if (!e.error)
3545                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
3546                                     (caddr_t)&res);
3547                         mutex_enter(&sp->s_lock);
3548                         goto recov_retry;
3549                 }
3550                 /* fall through for res.status case */
3551         }
3552 
3553         if (res.status) {
3554                 if (res.status == NFS4ERR_LEASE_MOVED) {
3555                         /*EMPTY*/
3556                         /*
3557                          * XXX need to try every mntinfo4 in sp->mntinfo4_list
3558                          * to renew the lease on that server
3559                          */
3560                 }
3561                 e.error = geterrno4(res.status);
3562         }
3563 
3564         if (!rpc_error)
3565                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3566 
3567         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3568 
3569         VFS_RELE(mi->mi_vfsp);
3570 
3571         return (e.error);
3572 }
3573 
3574 void
3575 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3576 {
3577         nfs4_server_t   *sp;
3578 
3579         /* this locks down sp if it is found */
3580         sp = find_nfs4_server(mi);
3581 
3582         if (sp != NULL) {
3583                 nfs4_inc_state_ref_count_nolock(sp, mi);
3584                 mutex_exit(&sp->s_lock);
3585                 nfs4_server_rele(sp);
3586         }
3587 }
3588 
3589 /*
3590  * Bump the number of OPEN files (i.e., those with state) so we know if this
3591  * nfs4_server has any state to maintain a lease for or not.
3592  *
3593  * Also marks the nfs4_server's lease valid if that hasn't been done already.
3594  */
3595 void
3596 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3597 {
3598         ASSERT(mutex_owned(&sp->s_lock));
3599 
3600         sp->state_ref_count++;
3601         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3602             "nfs4_inc_state_ref_count: state_ref_count now %d",
3603             sp->state_ref_count));
3604 
3605         if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3606                 sp->lease_valid = NFS4_LEASE_VALID;
3607 
3608         /*
3609          * If this call caused the lease to be marked valid and/or
3610          * took the state_ref_count from 0 to 1, then start the time
3611          * on lease renewal.
3612          */
3613         if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3614                 sp->last_renewal_time = gethrestime_sec();
3615 
3616         /* update the number of open files for mi */
3617         mi->mi_open_files++;
3618 }
3619 
3620 void
3621 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3622 {
3623         nfs4_server_t   *sp;
3624 
3625         /* this locks down sp if it is found */
3626         sp = find_nfs4_server_all(mi, 1);
3627 
3628         if (sp != NULL) {
3629                 nfs4_dec_state_ref_count_nolock(sp, mi);
3630                 mutex_exit(&sp->s_lock);
3631                 nfs4_server_rele(sp);
3632         }
3633 }
3634 
3635 /*
3636  * Decrement the number of OPEN files (i.e., those with state) so we know if
3637  * this nfs4_server has any state to maintain a lease for or not.
3638  */
3639 void
3640 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3641 {
3642         ASSERT(mutex_owned(&sp->s_lock));
3643         ASSERT(sp->state_ref_count != 0);
3644         sp->state_ref_count--;
3645 
3646         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3647             "nfs4_dec_state_ref_count: state ref count now %d",
3648             sp->state_ref_count));
3649 
3650         mi->mi_open_files--;
3651         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652             "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3653             mi->mi_open_files, mi->mi_flags));
3654 
3655         /* We don't have to hold the mi_lock to test mi_flags */
3656         if (mi->mi_open_files == 0 &&
3657             (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3658                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3659                     "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3660                     "we have closed the last open file", (void*)mi));
3661                 nfs4_remove_mi_from_server(mi, sp);
3662         }
3663 }
3664 
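     /*
      * Return TRUE if the server's lease is still in effect, i.e. the lease
      * has been marked valid and fewer than s_lease_time seconds have passed
      * since last_renewal_time.  The caller must hold sp->s_lock.
      */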
3665 bool_t
3666 inlease(nfs4_server_t *sp)
3667 {
3668         bool_t result;
3669 
3670         ASSERT(mutex_owned(&sp->s_lock));
3671 
3672         if (sp->lease_valid == NFS4_LEASE_VALID &&
3673             gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3674                 result = TRUE;
3675         else
3676                 result = FALSE;
3677 
3678         return (result);
3679 }
3680 
3681 
3682 /*
3683  * Return non-zero if the given nfs4_server_t is going through recovery.
3684  */
3685 
3686 int
3687 nfs4_server_in_recovery(nfs4_server_t *sp)
3688 {
3689         return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3690 }
3691 
3692 /*
3693  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3694  * first is less than, equal to, or greater than the second.
3695  */
3696 
3697 int
3698 sfh4cmp(const void *p1, const void *p2)
3699 {
3700         const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3701         const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3702 
3703         return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3704 }
3705 
3706 /*
3707  * Create a table for shared filehandle objects.
3708  */
3709 
3710 void
3711 sfh4_createtab(avl_tree_t *tab)
3712 {
3713         avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3714             offsetof(nfs4_sharedfh_t, sfh_tree));
3715 }
3716 
3717 /*
3718  * Return a shared filehandle object for the given filehandle.  The caller
3719  * is responsible for eventually calling sfh4_rele().
3720  */
3721 
3722 nfs4_sharedfh_t *
3723 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3724 {
3725         nfs4_sharedfh_t *sfh, *nsfh;
3726         avl_index_t where;
3727         nfs4_sharedfh_t skey;
3728 
3729         if (!key) {
3730                 skey.sfh_fh = *fh;
3731                 key = &skey;
3732         }
3733 
3734         nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3735         nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3736         /*
3737          * We allocate the largest possible filehandle size because it's
3738          * not that big, and it saves us from possibly having to resize the
3739          * buffer later.
3740          */
3741         nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3742         bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3743         mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3744         nsfh->sfh_refcnt = 1;
3745         nsfh->sfh_flags = SFH4_IN_TREE;
3746         nsfh->sfh_mi = mi;
3747         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3748             (void *)nsfh));
3749 
3750         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3751         sfh = avl_find(&mi->mi_filehandles, key, &where);
3752         if (sfh != NULL) {
3753                 mutex_enter(&sfh->sfh_lock);
3754                 sfh->sfh_refcnt++;
3755                 mutex_exit(&sfh->sfh_lock);
3756                 nfs_rw_exit(&mi->mi_fh_lock);
3757                 /* free our speculative allocs */
3758                 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3759                 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3760                 return (sfh);
3761         }
3762 
3763         avl_insert(&mi->mi_filehandles, nsfh, where);
3764         nfs_rw_exit(&mi->mi_fh_lock);
3765 
3766         return (nsfh);
3767 }
3768 
3769 /*
3770  * Return a shared filehandle object for the given filehandle.  The caller
3771  * is responsible for eventually calling sfh4_rele().
3772  */
3773 
3774 nfs4_sharedfh_t *
3775 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3776 {
3777         nfs4_sharedfh_t *sfh;
3778         nfs4_sharedfh_t key;
3779 
3780         ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3781 
3782 #ifdef DEBUG
3783         if (nfs4_sharedfh_debug) {
3784                 nfs4_fhandle_t fhandle;
3785 
3786                 fhandle.fh_len = fh->nfs_fh4_len;
3787                 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3788                 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3789                 nfs4_printfhandle(&fhandle);
3790         }
3791 #endif
3792 
3793         /*
3794          * If there's already an object for the given filehandle, bump the
3795          * reference count and return it.  Otherwise, create a new object
3796          * and add it to the AVL tree.
3797          */
3798 
3799         key.sfh_fh = *fh;
3800 
3801         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3802         sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3803         if (sfh != NULL) {
3804                 mutex_enter(&sfh->sfh_lock);
3805                 sfh->sfh_refcnt++;
3806                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3807                     "sfh4_get: found existing %p, new refcnt=%d",
3808                     (void *)sfh, sfh->sfh_refcnt));
3809                 mutex_exit(&sfh->sfh_lock);
3810                 nfs_rw_exit(&mi->mi_fh_lock);
3811                 return (sfh);
3812         }
3813         nfs_rw_exit(&mi->mi_fh_lock);
3814 
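             /*
              * Not found under the read lock.  sfh4_put() re-checks under
              * the write lock and frees its speculative allocation if
              * another thread inserted the filehandle in the meantime.
              */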
3815         return (sfh4_put(fh, mi, &key));
3816 }
3817 
3818 /*
3819  * Get a reference to the given shared filehandle object.
3820  */
3821 
3822 void
3823 sfh4_hold(nfs4_sharedfh_t *sfh)
3824 {
3825         ASSERT(sfh->sfh_refcnt > 0);
3826 
3827         mutex_enter(&sfh->sfh_lock);
3828         sfh->sfh_refcnt++;
3829         NFS4_DEBUG(nfs4_sharedfh_debug,
3830             (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3831             (void *)sfh, sfh->sfh_refcnt));
3832         mutex_exit(&sfh->sfh_lock);
3833 }
3834 
3835 /*
3836  * Release a reference to the given shared filehandle object and null out
3837  * the given pointer.
3838  */
3839 
3840 void
3841 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3842 {
3843         mntinfo4_t *mi;
3844         nfs4_sharedfh_t *sfh = *sfhpp;
3845 
3846         ASSERT(sfh->sfh_refcnt > 0);
3847 
3848         mutex_enter(&sfh->sfh_lock);
3849         if (sfh->sfh_refcnt > 1) {
3850                 sfh->sfh_refcnt--;
3851                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3852                     "sfh4_rele %p, new refcnt=%d",
3853                     (void *)sfh, sfh->sfh_refcnt));
3854                 mutex_exit(&sfh->sfh_lock);
3855                 goto finish;
3856         }
3857         mutex_exit(&sfh->sfh_lock);
3858 
3859         /*
3860          * Possibly the last reference, so get the lock for the table in
3861          * case it's time to remove the object from the table.
3862          */
3863         mi = sfh->sfh_mi;
3864         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3865         mutex_enter(&sfh->sfh_lock);
3866         sfh->sfh_refcnt--;
3867         if (sfh->sfh_refcnt > 0) {
3868                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3869                     "sfh4_rele %p, new refcnt=%d",
3870                     (void *)sfh, sfh->sfh_refcnt));
3871                 mutex_exit(&sfh->sfh_lock);
3872                 nfs_rw_exit(&mi->mi_fh_lock);
3873                 goto finish;
3874         }
3875 
3876         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3877             "sfh4_rele %p, last ref", (void *)sfh));
3878         if (sfh->sfh_flags & SFH4_IN_TREE) {
3879                 avl_remove(&mi->mi_filehandles, sfh);
3880                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3881         }
3882         mutex_exit(&sfh->sfh_lock);
3883         nfs_rw_exit(&mi->mi_fh_lock);
3884         mutex_destroy(&sfh->sfh_lock);
3885         kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3886         kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3887 
3888 finish:
3889         *sfhpp = NULL;
3890 }
3891 
3892 /*
3893  * Update the filehandle for the given shared filehandle object.
3894  */
3895 
3896 int nfs4_warn_dupfh = 0;        /* if set, always warn about dup fhs below */
3897 
3898 void
3899 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3900 {
3901         mntinfo4_t *mi = sfh->sfh_mi;
3902         nfs4_sharedfh_t *dupsfh;
3903         avl_index_t where;
3904         nfs4_sharedfh_t key;
3905 
3906 #ifdef DEBUG
3907         mutex_enter(&sfh->sfh_lock);
3908         ASSERT(sfh->sfh_refcnt > 0);
3909         mutex_exit(&sfh->sfh_lock);
3910 #endif
3911         ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3912 
3913         /*
3914          * The basic plan is to remove the shared filehandle object from
3915          * the table, update it to have the new filehandle, then reinsert
3916          * it.
3917          */
3918 
3919         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3920         mutex_enter(&sfh->sfh_lock);
3921         if (sfh->sfh_flags & SFH4_IN_TREE) {
3922                 avl_remove(&mi->mi_filehandles, sfh);
3923                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3924         }
3925         mutex_exit(&sfh->sfh_lock);
3926         sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3927         bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3928             sfh->sfh_fh.nfs_fh4_len);
3929 
3930         /*
3931          * XXX If there is already a shared filehandle object with the new
3932          * filehandle, we're in trouble, because the rnode code assumes
3933          * that there is only one shared filehandle object for a given
3934          * filehandle.  So issue a warning (for read-write mounts only)
3935          * and don't try to re-insert the given object into the table.
3936          * Hopefully the given object will quickly go away and everyone
3937          * will use the new object.
3938          */
3939         key.sfh_fh = *newfh;
3940         dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3941         if (dupsfh != NULL) {
3942                 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3943                         zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3944                             "duplicate filehandle detected");
3945                         sfh4_printfhandle(dupsfh);
3946                 }
3947         } else {
3948                 avl_insert(&mi->mi_filehandles, sfh, where);
3949                 mutex_enter(&sfh->sfh_lock);
3950                 sfh->sfh_flags |= SFH4_IN_TREE;
3951                 mutex_exit(&sfh->sfh_lock);
3952         }
3953         nfs_rw_exit(&mi->mi_fh_lock);
3954 }
3955 
3956 /*
3957  * Copy out the current filehandle for the given shared filehandle object.
3958  */
3959 
3960 void
3961 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3962 {
3963         mntinfo4_t *mi = sfh->sfh_mi;
3964 
3965         ASSERT(sfh->sfh_refcnt > 0);
3966 
3967         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3968         fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3969         ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3970         bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3971         nfs_rw_exit(&mi->mi_fh_lock);
3972 }
3973 
3974 /*
3975  * Print out the filehandle for the given shared filehandle object.
3976  */
3977 
3978 void
3979 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3980 {
3981         nfs4_fhandle_t fhandle;
3982 
3983         sfh4_copyval(sfh, &fhandle);
3984         nfs4_printfhandle(&fhandle);
3985 }
3986 
3987 /*
3988  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3989  * if they're the same, +1 if the first is "greater" than the second.  The
3990  * caller (or whoever's calling the AVL package) is responsible for
3991  * handling locking issues.
3992  */
3993 
3994 static int
3995 fncmp(const void *p1, const void *p2)
3996 {
3997         const nfs4_fname_t *f1 = p1;
3998         const nfs4_fname_t *f2 = p2;
3999         int res;
4000 
4001         res = strcmp(f1->fn_name, f2->fn_name);
4002         /*
4003          * The AVL package wants +/-1, not arbitrary positive or negative
4004          * integers.
4005          */
4006         if (res > 0)
4007                 res = 1;
4008         else if (res < 0)
4009                 res = -1;
4010         return (res);
4011 }
4012 
4013 /*
4014  * Get or create an fname with the given name, as a child of the given
4015  * fname.  The caller is responsible for eventually releasing the reference
4016  * (fn_rele()).  parent may be NULL.
4017  */
4018 
4019 nfs4_fname_t *
4020 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4021 {
4022         nfs4_fname_t key;
4023         nfs4_fname_t *fnp;
4024         avl_index_t where;
4025 
4026         key.fn_name = name;
4027 
4028         /*
4029          * If there's already an fname registered with the given name, bump
4030          * its reference count and return it.  Otherwise, create a new one
4031          * and add it to the parent's AVL tree.
4032          *
4033          * fname entries we are looking for should match both name
4034          * and sfh stored in the fname.
4035          */
4036 again:
4037         if (parent != NULL) {
4038                 mutex_enter(&parent->fn_lock);
4039                 fnp = avl_find(&parent->fn_children, &key, &where);
4040                 if (fnp != NULL) {
4041                         /*
4042                          * Take a hold on fnp; it is released below if
4043                          * this turns out not to be the fnp we want.
4044                          */
4045                         fn_hold(fnp);
4046 
4047                         if (fnp->fn_sfh == sfh) {
4048                                 /*
4049                                  * We have found our entry; return it
4050                                  * with the hold taken above.
4051                                  */
4052                                 mutex_exit(&parent->fn_lock);
4053                                 return (fnp);
4054                         }
4055 
4056                         /*
4057                          * We have found an entry with a mismatching
4058                          * fn_sfh.  This could be a stale entry left by a
4059                          * server-side rename.  Remove it and retry so
4060                          * that no such entries remain.
4061                          */
4062                         mutex_exit(&parent->fn_lock);
4063                         mutex_enter(&fnp->fn_lock);
4064                         if (fnp->fn_parent == parent) {
4065                                 /*
4066                                  * Remove ourselves from parent's
4067                                  * fn_children tree.
4068                                  */
4069                                 mutex_enter(&parent->fn_lock);
4070                                 avl_remove(&parent->fn_children, fnp);
4071                                 mutex_exit(&parent->fn_lock);
4072                                 fn_rele(&fnp->fn_parent);
4073                         }
4074                         mutex_exit(&fnp->fn_lock);
4075                         fn_rele(&fnp);
4076                         goto again;
4077                 }
4078         }
4079 
4080         fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4081         mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4082         fnp->fn_parent = parent;
4083         if (parent != NULL)
4084                 fn_hold(parent);
4085         fnp->fn_len = strlen(name);
4086         ASSERT(fnp->fn_len < MAXNAMELEN);
4087         fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4088         (void) strcpy(fnp->fn_name, name);
4089         fnp->fn_refcnt = 1;
4090 
4091         /*
4092          * This hold on sfh is later released
4093          * when we do the final fn_rele() on this fname.
4094          */
4095         sfh4_hold(sfh);
4096         fnp->fn_sfh = sfh;
4097 
4098         avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4099             offsetof(nfs4_fname_t, fn_tree));
4100         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4101             "fn_get %p:%s, a new nfs4_fname_t!",
4102             (void *)fnp, fnp->fn_name));
4103         if (parent != NULL) {
4104                 avl_insert(&parent->fn_children, fnp, where);
4105                 mutex_exit(&parent->fn_lock);
4106         }
4107 
4108         return (fnp);
4109 }
4110 
4111 void
4112 fn_hold(nfs4_fname_t *fnp)
4113 {
4114         atomic_add_32(&fnp->fn_refcnt, 1);
4115         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4116             "fn_hold %p:%s, new refcnt=%d",
4117             (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4118 }
4119 
4120 /*
4121  * Decrement the reference count of the given fname, and destroy it if its
4122  * reference count goes to zero.  Nulls out the given pointer.
4123  */
4124 
4125 void
4126 fn_rele(nfs4_fname_t **fnpp)
4127 {
4128         nfs4_fname_t *parent;
4129         uint32_t newref;
4130         nfs4_fname_t *fnp;
4131 
4132 recur:
4133         fnp = *fnpp;
4134         *fnpp = NULL;
4135 
4136         mutex_enter(&fnp->fn_lock);
4137         parent = fnp->fn_parent;
4138         if (parent != NULL)
4139                 mutex_enter(&parent->fn_lock);   /* prevent new references */
4140         newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4141         if (newref > 0) {
4142                 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4143                     "fn_rele %p:%s, new refcnt=%d",
4144                     (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4145                 if (parent != NULL)
4146                         mutex_exit(&parent->fn_lock);
4147                 mutex_exit(&fnp->fn_lock);
4148                 return;
4149         }
4150 
4151         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152             "fn_rele %p:%s, last reference, deleting...",
4153             (void *)fnp, fnp->fn_name));
4154         if (parent != NULL) {
4155                 avl_remove(&parent->fn_children, fnp);
4156                 mutex_exit(&parent->fn_lock);
4157         }
4158         kmem_free(fnp->fn_name, fnp->fn_len + 1);
4159         sfh4_rele(&fnp->fn_sfh);
4160         mutex_destroy(&fnp->fn_lock);
4161         avl_destroy(&fnp->fn_children);
4162         kmem_free(fnp, sizeof (nfs4_fname_t));
4163         /*
4164          * Recursively fn_rele the parent.
4165          * Use goto instead of a recursive call to avoid stack overflow.
4166          */
4167         if (parent != NULL) {
4168                 fnpp = &parent;
4169                 goto recur;
4170         }
4171 }
4172 
4173 /*
4174  * Returns the single component name of the given fname, in a MAXNAMELEN
4175  * string buffer, which the caller is responsible for freeing.  Note that
4176  * the name may become invalid as a result of fn_move().
4177  */
4178 
4179 char *
4180 fn_name(nfs4_fname_t *fnp)
4181 {
4182         char *name;
4183 
4184         ASSERT(fnp->fn_len < MAXNAMELEN);
4185         name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4186         mutex_enter(&fnp->fn_lock);
4187         (void) strcpy(name, fnp->fn_name);
4188         mutex_exit(&fnp->fn_lock);
4189 
4190         return (name);
4191 }
4192 
4193 
4194 /*
4195  * fn_path_realloc
4196  *
4197  * This function, used only by fn_path, constructs
4198  * a new string that looks like "prepend" + "/" + "current"
4199  * by allocating a new string and freeing the old one.
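      * For example, if *curses is "b/c" and prepend is "a", the result
      * is "a/b/c".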
4200  */
4201 static void
4202 fn_path_realloc(char **curses, char *prepend)
4203 {
4204         int len, curlen = 0;
4205         char *news;
4206 
4207         if (*curses == NULL) {
4208                 /*
4209                  * Prime the pump, allocate just the
4210                  * space for prepend and return that.
4211                  */
4212                 len = strlen(prepend) + 1;
4213                 news = kmem_alloc(len, KM_SLEEP);
4214                 (void) strncpy(news, prepend, len);
4215         } else {
4216                 /*
4217                  * Allocate the space for the new string; the
4218                  * +1 +1 is for the "/" and the terminating NUL
4219                  * byte at the end of it all.
4220                  */
4221                 curlen = strlen(*curses);
4222                 len = curlen + strlen(prepend) + 1 + 1;
4223                 news = kmem_alloc(len, KM_SLEEP);
4224                 (void) strncpy(news, prepend, len);
4225                 (void) strcat(news, "/");
4226                 (void) strcat(news, *curses);
4227                 kmem_free(*curses, curlen + 1);
4228         }
4229         *curses = news;
4230 }
4231 
4232 /*
4233  * Returns the path name (starting from the fs root) for the given fname.
4234  * The caller is responsible for freeing.  Note that the path may be or
4235  * become invalid as a result of fn_move().
4236  */
4237 
4238 char *
4239 fn_path(nfs4_fname_t *fnp)
4240 {
4241         char *path;
4242         nfs4_fname_t *nextfnp;
4243 
4244         if (fnp == NULL)
4245                 return (NULL);
4246 
4247         path = NULL;
4248 
4249         /* walk up the tree constructing the pathname.  */
4250 
4251         fn_hold(fnp);                   /* adjust for later rele */
4252         do {
4253                 mutex_enter(&fnp->fn_lock);
4254                 /*
4255                  * Add fn_name in front of the current path
4256                  */
4257                 fn_path_realloc(&path, fnp->fn_name);
4258                 nextfnp = fnp->fn_parent;
4259                 if (nextfnp != NULL)
4260                         fn_hold(nextfnp);
4261                 mutex_exit(&fnp->fn_lock);
4262                 fn_rele(&fnp);
4263                 fnp = nextfnp;
4264         } while (fnp != NULL);
4265 
4266         return (path);
4267 }
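
Unlike fn_name(), the buffer fn_path() returns is sized exactly to the path it built, so the usual way to release it is with strlen() + 1. A hedged caller sketch, again assuming fnp is an fname the caller holds:

        char *path;

        path = fn_path(fnp);
        if (path != NULL) {
                /* ... log or otherwise use the path ... */
                kmem_free(path, strlen(path) + 1);      /* exactly-sized buffer */
        }
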
4268 
4269 /*
4270  * Return a reference to the parent of the given fname, which the caller is
4271  * responsible for eventually releasing.
4272  */
4273 
4274 nfs4_fname_t *
4275 fn_parent(nfs4_fname_t *fnp)
4276 {
4277         nfs4_fname_t *parent;
4278 
4279         mutex_enter(&fnp->fn_lock);
4280         parent = fnp->fn_parent;
4281         if (parent != NULL)
4282                 fn_hold(parent);
4283         mutex_exit(&fnp->fn_lock);
4284 
4285         return (parent);
4286 }
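
fn_parent() takes a hold on the parent before returning it, so every non-NULL return must eventually be paired with an fn_rele(). A minimal sketch of that pairing (fnp again assumed to be an fname the caller holds):

        nfs4_fname_t *parent;

        parent = fn_parent(fnp);
        if (parent != NULL) {
                /* ... inspect or compare the parent ... */
                fn_rele(&parent);       /* drop the hold fn_parent() took */
        }
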
4287 
4288 /*
4289  * Update fnp so that its parent is newparent and its name is newname.
4290  */
4291 
4292 void
4293 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4294 {
4295         nfs4_fname_t *parent, *tmpfnp;
4296         ssize_t newlen;
4297         nfs4_fname_t key;
4298         avl_index_t where;
4299 
4300         /*
4301          * This assert exists to catch the client trying to rename
4302          * a directory to be a child of itself.  This happened at a recent
4303          * bakeoff against a third-party (broken) server which allowed
4304          * the rename to succeed.  If it trips, it means that both:
4305          *      a) the code in nfs4rename that detects this case is broken, and
4306          *      b) the server is broken (since it allowed the bogus rename).
4307          *
4308          * On non-DEBUG kernels the ASSERT compiles away, and the result is
4309          * a recursive mutex_enter panic below at mutex_enter(&newparent->fn_lock).
4310          */
4311         ASSERT(fnp != newparent);
4312 
4313         /*
4314          * Remove fnp from its current parent, change its name, then add it
4315          * to newparent. It might happen that fnp was replaced by another
4316          * nfs4_fname_t with the same fn_name in parent->fn_children.
4317          * In such a case, fnp->fn_parent is NULL and we skip the removal
4318          * of fnp from its current parent.
4319          */
4320         mutex_enter(&fnp->fn_lock);
4321         parent = fnp->fn_parent;
4322         if (parent != NULL) {
4323                 mutex_enter(&parent->fn_lock);
4324                 avl_remove(&parent->fn_children, fnp);
4325                 mutex_exit(&parent->fn_lock);
4326                 fn_rele(&fnp->fn_parent);
4327         }
4328 
4329         newlen = strlen(newname);
4330         if (newlen != fnp->fn_len) {
4331                 ASSERT(newlen < MAXNAMELEN);
4332                 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4333                 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4334                 fnp->fn_len = newlen;
4335         }
4336         (void) strcpy(fnp->fn_name, newname);
4337 
4338 again:
4339         mutex_enter(&newparent->fn_lock);
4340         key.fn_name = fnp->fn_name;
4341         tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4342         if (tmpfnp != NULL) {
4343                 /*
4344                  * This could be due to a file that was unlinked while
4345                  * open, or perhaps the rnode is in the free list.  Remove
4346                  * it from newparent and let it go away on its own.  The
4347                  * contorted code is to deal with lock order issues and
4348                  * race conditions.
4349                  */
4350                 fn_hold(tmpfnp);
4351                 mutex_exit(&newparent->fn_lock);
4352                 mutex_enter(&tmpfnp->fn_lock);
4353                 if (tmpfnp->fn_parent == newparent) {
4354                         mutex_enter(&newparent->fn_lock);
4355                         avl_remove(&newparent->fn_children, tmpfnp);
4356                         mutex_exit(&newparent->fn_lock);
4357                         fn_rele(&tmpfnp->fn_parent);
4358                 }
4359                 mutex_exit(&tmpfnp->fn_lock);
4360                 fn_rele(&tmpfnp);
4361                 goto again;
4362         }
4363         fnp->fn_parent = newparent;
4364         fn_hold(newparent);
4365         avl_insert(&newparent->fn_children, fnp, where);
4366         mutex_exit(&newparent->fn_lock);
4367         mutex_exit(&fnp->fn_lock);
4368 }
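
For context, fn_move() is what a rename path calls once the over-the-wire RENAME has succeeded: it re-parents the cached name without going back to the server. A hedged sketch of such a caller, where fnp and newdir_fnp are nfs4_fname_t pointers the caller is assumed to hold and nnm is the new last component; note that passing the same fname as both arguments would trip the ASSERT above (or deadlock on fn_lock on a non-DEBUG kernel), which is why the rename code screens out that case first.

        /*
         * After the server has accepted the RENAME, update the cached
         * name: fnp now lives under newdir_fnp as "nnm".  (Illustrative
         * only; the real rename code does considerably more work.)
         */
        fn_move(fnp, newdir_fnp, nnm);
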
4369 
4370 #ifdef DEBUG
4371 /*
4372  * Return non-zero if the type information makes sense for the given vnode.
4373  * Otherwise panic.
4374  */
4375 int
4376 nfs4_consistent_type(vnode_t *vp)
4377 {
4378         rnode4_t *rp = VTOR4(vp);
4379 
4380         if (nfs4_vtype_debug && vp->v_type != VNON &&
4381             rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4382                 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4383                     "rnode attr type=%d", (void *)vp, vp->v_type,
4384                     rp->r_attr.va_type);
4385         }
4386 
4387         return (1);
4388 }
4389 #endif /* DEBUG */
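
Since nfs4_consistent_type() either returns 1 or panics, and is only built under DEBUG, the natural way to use it is inside an ASSERT(), so both the check and the call disappear on non-DEBUG kernels. A sketch of that usage, with vp a vnode the caller holds:

        /* DEBUG-only sanity check; the ASSERT (and the call) vanish otherwise. */
        ASSERT(nfs4_consistent_type(vp));
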
--- EOF ---