NFS4 data corruption (#3508)
When async calls are disabled, nfs4_async_putapage is supposed to do its
work synchronously. Due to a bug, it sometimes does nothing at all,
leaving the page to be written later.
Unfortunately, by that point the caller has already cleared the R4DIRTY
flag. Without R4DIRTY, nfs4_attrcache_va cannot tell that there are still
outstanding writes, so it accepts the file size reported by the server,
which is too small.
When the dirty page finally gets written back, the write is truncated to
that stale file size, leaving some bytes unwritten.
Reviewed by: Marcel Telka <marcel@telka.sk>
Reviewed by: Robert Gordon <rbg@openrbg.com>
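
To make the failure sequence concrete, below is a minimal user-space sketch
of the scenario described above. It is illustrative only and is not the
kernel code or the actual fix: dirty, cached_size, putapage() and
attrcache_update() merely stand in for R4DIRTY, r_size, the synchronous
fallback in nfs4_async_putapage() and the size update in nfs4_attrcache_va().

/*
 * Model of the #3508 sequence: the caller clears its dirty flag expecting
 * the page to be flushed, the buggy synchronous fallback does nothing, and
 * the attribute cache then accepts the server's smaller file size.
 */
#include <stdbool.h>
#include <stdio.h>

static long long cached_size = 8192;    /* client's size after a local write */
static long long server_size = 4096;    /* server has not seen that write yet */
static bool dirty = true;               /* stands in for R4DIRTY */

/* Stand-in for nfs4_async_putapage() when async I/O is disabled. */
static void
putapage(bool buggy)
{
        if (buggy)
                return;                 /* bug: leaves the page for later */
        server_size = cached_size;      /* intended: write the page now */
}

/* Stand-in for the size update guard in nfs4_attrcache_va(). */
static void
attrcache_update(void)
{
        if (!dirty)                     /* no R4DIRTY: looks like no outstanding writes */
                cached_size = server_size;
}

int
main(void)
{
        dirty = false;                  /* caller has already reset R4DIRTY ... */
        putapage(true);                 /* ... but the page never gets written */
        attrcache_update();             /* the stale server size is accepted */

        /* A later writeback would be truncated to cached_size (now 4096). */
        printf("cached size = %lld, expected 8192\n", cached_size);
        return (0);
}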

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  27  *      All Rights Reserved
  28  */
  29 
  30 #include <sys/param.h>
  31 #include <sys/types.h>
  32 #include <sys/systm.h>
  33 #include <sys/thread.h>
  34 #include <sys/t_lock.h>
  35 #include <sys/time.h>
  36 #include <sys/vnode.h>
  37 #include <sys/vfs.h>
  38 #include <sys/errno.h>
  39 #include <sys/buf.h>
  40 #include <sys/stat.h>
  41 #include <sys/cred.h>
  42 #include <sys/kmem.h>
  43 #include <sys/debug.h>
  44 #include <sys/dnlc.h>
  45 #include <sys/vmsystm.h>
  46 #include <sys/flock.h>
  47 #include <sys/share.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/tiuser.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/callb.h>
  52 #include <sys/acl.h>
  53 #include <sys/kstat.h>
  54 #include <sys/signal.h>
  55 #include <sys/disp.h>
  56 #include <sys/atomic.h>
  57 #include <sys/list.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/xdr.h>
  62 #include <rpc/auth.h>
  63 #include <rpc/clnt.h>
  64 
  65 #include <nfs/nfs.h>
  66 #include <nfs/nfs_clnt.h>
  67 #include <nfs/nfs_acl.h>
  68 
  69 #include <nfs/nfs4.h>
  70 #include <nfs/rnode4.h>
  71 #include <nfs/nfs4_clnt.h>
  72 
  73 #include <vm/hat.h>
  74 #include <vm/as.h>
  75 #include <vm/page.h>
  76 #include <vm/pvn.h>
  77 #include <vm/seg.h>
  78 #include <vm/seg_map.h>
  79 #include <vm/seg_vn.h>
  80 
  81 #include <sys/ddi.h>
  82 
  83 /*
  84  * Arguments to page-flush thread.
  85  */
  86 typedef struct {
  87         vnode_t *vp;
  88         cred_t *cr;
  89 } pgflush_t;
  90 
  91 #ifdef DEBUG
  92 int nfs4_client_lease_debug;
  93 int nfs4_sharedfh_debug;
  94 int nfs4_fname_debug;
  95 
  96 /* temporary: panic if v_type is inconsistent with r_attr va_type */
  97 int nfs4_vtype_debug;
  98 
  99 uint_t nfs4_tsd_key;
 100 #endif
 101 
 102 static time_t   nfs4_client_resumed = 0;
 103 static  callb_id_t cid = 0;
 104 
 105 static int      nfs4renew(nfs4_server_t *);
 106 static void     nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
 107 static void     nfs4_pgflush_thread(pgflush_t *);
 108 
 109 static boolean_t nfs4_client_cpr_callb(void *, int);
 110 
 111 struct mi4_globals {
 112         kmutex_t        mig_lock;  /* lock protecting mig_list */
 113         list_t          mig_list;  /* list of NFS v4 mounts in zone */
 114         boolean_t       mig_destructor_called;
 115 };
 116 
 117 static zone_key_t mi4_list_key;
 118 
 119 /*
 120  * Attributes caching:
 121  *
 122  * Attributes are cached in the rnode in struct vattr form.
 123  * There is a time associated with the cached attributes (r_time_attr_inval)
 124  * which tells whether the attributes are valid. The time is initialized
 125  * to the difference between current time and the modify time of the vnode
 126  * when new attributes are cached. This allows the attributes for
 127  * files that have changed recently to be timed out sooner than for files
 128  * that have not changed for a long time. There are minimum and maximum
 129  * timeout values that can be set per mount point.
 130  */
 131 
 132 /*
 133  * If a cache purge is in progress, wait for it to finish.
 134  *
 135  * The current thread must not be in the middle of an
 136  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 137  * between this thread, a recovery thread, and the page flush thread.
 138  */
 139 int
 140 nfs4_waitfor_purge_complete(vnode_t *vp)
 141 {
 142         rnode4_t *rp;
 143         k_sigset_t smask;
 144 
 145         rp = VTOR4(vp);
 146         if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 147             ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
 148                 mutex_enter(&rp->r_statelock);
 149                 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
 150                 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 151                     ((rp->r_flags & R4PGFLUSH) &&
 152                     rp->r_pgflush != curthread)) {
 153                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 154                                 sigunintr(&smask);
 155                                 mutex_exit(&rp->r_statelock);
 156                                 return (EINTR);
 157                         }
 158                 }
 159                 sigunintr(&smask);
 160                 mutex_exit(&rp->r_statelock);
 161         }
 162         return (0);
 163 }
 164 
 165 /*
 166  * Validate caches by checking cached attributes. If they have timed out,
 167  * then get new attributes from the server.  As a side effect, cache
 168  * invalidation is done if the attributes have changed.
 169  *
 170  * If the attributes have not timed out and if there is a cache
 171  * invalidation being done by some other thread, then wait until that
 172  * thread has completed the cache invalidation.
 173  */
 174 int
 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
 176 {
 177         int error;
 178         nfs4_ga_res_t gar;
 179 
 180         if (ATTRCACHE4_VALID(vp)) {
 181                 error = nfs4_waitfor_purge_complete(vp);
 182                 if (error)
 183                         return (error);
 184                 return (0);
 185         }
 186 
 187         gar.n4g_va.va_mask = AT_ALL;
 188         return (nfs4_getattr_otw(vp, &gar, cr, 0));
 189 }
 190 
 191 /*
 192  * Fill in attribute from the cache.
 193  * If valid, then return 0 to indicate that no error occurred,
 194  * otherwise return 1 to indicate that an error occurred.
 195  */
 196 static int
 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
 198 {
 199         rnode4_t *rp;
 200 
 201         rp = VTOR4(vp);
 202         mutex_enter(&rp->r_statelock);
 203         mutex_enter(&rp->r_statev4_lock);
 204         if (ATTRCACHE4_VALID(vp)) {
 205                 mutex_exit(&rp->r_statev4_lock);
 206                 /*
 207                  * Cached attributes are valid
 208                  */
 209                 *vap = rp->r_attr;
 210                 mutex_exit(&rp->r_statelock);
 211                 return (0);
 212         }
 213         mutex_exit(&rp->r_statev4_lock);
 214         mutex_exit(&rp->r_statelock);
 215         return (1);
 216 }
 217 
 218 
 219 /*
 220  * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 221  * call is synchronous because all the pages were invalidated by the
 222  * nfs4_invalidate_pages() call.
 223  */
 224 void
 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
 226 {
 227         struct rnode4 *rp = VTOR4(vp);
 228 
 229         /* Ensure that the ..._end_op() call has been done */
 230         ASSERT(tsd_get(nfs4_tsd_key) == NULL);
 231 
 232         if (errno != ESTALE)
 233                 return;
 234 
 235         mutex_enter(&rp->r_statelock);
 236         rp->r_flags |= R4STALE;
 237         if (!rp->r_error)
 238                 rp->r_error = errno;
 239         mutex_exit(&rp->r_statelock);
 240         if (nfs4_has_pages(vp))
 241                 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
 242         nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
 243 }
 244 
 245 /*
 246  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 247  * page purge is done asynchronously.
 248  */
 249 void
 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
 251 {
 252         rnode4_t *rp;
 253         char *contents;
 254         vnode_t *xattr;
 255         int size;
 256         int pgflush;                    /* are we the page flush thread? */
 257 
 258         /*
 259          * Purge the DNLC for any entries which refer to this file.
 260          */
 261         if (vp->v_count > 1 &&
 262             (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
 263                 dnlc_purge_vp(vp);
 264 
 265         /*
 266          * Clear any readdir state bits and purge the readlink response cache.
 267          */
 268         rp = VTOR4(vp);
 269         mutex_enter(&rp->r_statelock);
 270         rp->r_flags &= ~R4LOOKUP;
 271         contents = rp->r_symlink.contents;
 272         size = rp->r_symlink.size;
 273         rp->r_symlink.contents = NULL;
 274 
 275         xattr = rp->r_xattr_dir;
 276         rp->r_xattr_dir = NULL;
 277 
 278         /*
 279          * Purge pathconf cache too.
 280          */
 281         rp->r_pathconf.pc4_xattr_valid = 0;
 282         rp->r_pathconf.pc4_cache_valid = 0;
 283 
 284         pgflush = (curthread == rp->r_pgflush);
 285         mutex_exit(&rp->r_statelock);
 286 
 287         if (contents != NULL) {
 288 
 289                 kmem_free((void *)contents, size);
 290         }
 291 
 292         if (xattr != NULL)
 293                 VN_RELE(xattr);
 294 
 295         /*
 296          * Flush the page cache.  If the current thread is the page flush
 297          * thread, don't initiate a new page flush.  There's no need for
 298          * it, and doing it correctly is hard.
 299          */
 300         if (nfs4_has_pages(vp) && !pgflush) {
 301                 if (!asyncpg) {
 302                         (void) nfs4_waitfor_purge_complete(vp);
 303                         nfs4_flush_pages(vp, cr);
 304                 } else {
 305                         pgflush_t *args;
 306 
 307                         /*
 308                          * We don't hold r_statelock while creating the
 309                          * thread, in case the call blocks.  So we use a
 310                          * flag to indicate that a page flush thread is
 311                          * active.
 312                          */
 313                         mutex_enter(&rp->r_statelock);
 314                         if (rp->r_flags & R4PGFLUSH) {
 315                                 mutex_exit(&rp->r_statelock);
 316                         } else {
 317                                 rp->r_flags |= R4PGFLUSH;
 318                                 mutex_exit(&rp->r_statelock);
 319 
 320                                 args = kmem_alloc(sizeof (pgflush_t),
 321                                     KM_SLEEP);
 322                                 args->vp = vp;
 323                                 VN_HOLD(args->vp);
 324                                 args->cr = cr;
 325                                 crhold(args->cr);
 326                                 (void) zthread_create(NULL, 0,
 327                                     nfs4_pgflush_thread, args, 0,
 328                                     minclsyspri);
 329                         }
 330                 }
 331         }
 332 
 333         /*
 334          * Flush the readdir response cache.
 335          */
 336         nfs4_purge_rddir_cache(vp);
 337 }
 338 
 339 /*
 340  * Invalidate all pages for the given file, after writing back the dirty
 341  * ones.
 342  */
 343 
 344 void
 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
 346 {
 347         int error;
 348         rnode4_t *rp = VTOR4(vp);
 349 
 350         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
 351         if (error == ENOSPC || error == EDQUOT) {
 352                 mutex_enter(&rp->r_statelock);
 353                 if (!rp->r_error)
 354                         rp->r_error = error;
 355                 mutex_exit(&rp->r_statelock);
 356         }
 357 }
 358 
 359 /*
 360  * Page flush thread.
 361  */
 362 
 363 static void
 364 nfs4_pgflush_thread(pgflush_t *args)
 365 {
 366         rnode4_t *rp = VTOR4(args->vp);
 367 
 368         /* remember which thread we are, so we don't deadlock ourselves */
 369         mutex_enter(&rp->r_statelock);
 370         ASSERT(rp->r_pgflush == NULL);
 371         rp->r_pgflush = curthread;
 372         mutex_exit(&rp->r_statelock);
 373 
 374         nfs4_flush_pages(args->vp, args->cr);
 375 
 376         mutex_enter(&rp->r_statelock);
 377         rp->r_pgflush = NULL;
 378         rp->r_flags &= ~R4PGFLUSH;
 379         cv_broadcast(&rp->r_cv);
 380         mutex_exit(&rp->r_statelock);
 381 
 382         VN_RELE(args->vp);
 383         crfree(args->cr);
 384         kmem_free(args, sizeof (pgflush_t));
 385         zthread_exit();
 386 }
 387 
 388 /*
 389  * Purge the readdir cache of all entries which are not currently
 390  * being filled.
 391  */
 392 void
 393 nfs4_purge_rddir_cache(vnode_t *vp)
 394 {
 395         rnode4_t *rp;
 396 
 397         rp = VTOR4(vp);
 398 
 399         mutex_enter(&rp->r_statelock);
 400         rp->r_direof = NULL;
 401         rp->r_flags &= ~R4LOOKUP;
 402         rp->r_flags |= R4READDIRWATTR;
 403         rddir4_cache_purge(rp);
 404         mutex_exit(&rp->r_statelock);
 405 }
 406 
 407 /*
 408  * Set attributes cache for given vnode using virtual attributes.  There is
 409  * no cache validation, but if the attributes are deemed to be stale, they
 410  * are ignored.  This corresponds to nfs3_attrcache().
 411  *
 412  * Set the timeout value on the attribute cache and fill it
 413  * with the passed in attributes.
 414  */
 415 void
 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
 417 {
 418         rnode4_t *rp = VTOR4(vp);
 419 
 420         mutex_enter(&rp->r_statelock);
 421         if (rp->r_time_attr_saved <= t)
 422                 nfs4_attrcache_va(vp, garp, FALSE);
 423         mutex_exit(&rp->r_statelock);
 424 }
 425 
 426 /*
 427  * Use the passed in virtual attributes to check to see whether the
 428  * data and metadata caches are valid, cache the new attributes, and
 429  * then do the cache invalidation if required.
 430  *
 431  * The cache validation and caching of the new attributes is done
 432  * atomically via the use of the mutex, r_statelock.  If required,
 433  * the cache invalidation is done atomically w.r.t. the cache
 434  * validation and caching of the attributes via the pseudo lock,
 435  * r_serial.
 436  *
 437  * This routine is used to do cache validation and attributes caching
 438  * for operations with a single set of post operation attributes.
 439  */
 440 
 441 void
 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
 443     hrtime_t t, cred_t *cr, int async,
 444     change_info4 *cinfo)
 445 {
 446         rnode4_t *rp;
 447         int mtime_changed = 0;
 448         int ctime_changed = 0;
 449         vsecattr_t *vsp;
 450         int was_serial, set_time_cache_inval, recov;
 451         vattr_t *vap = &garp->n4g_va;
 452         mntinfo4_t *mi = VTOMI4(vp);
 453         len_t preattr_rsize;
 454         boolean_t writemodify_set = B_FALSE;
 455         boolean_t cachepurge_set = B_FALSE;
 456 
 457         ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
 458 
 459         /* Is curthread the recovery thread? */
 460         mutex_enter(&mi->mi_lock);
 461         recov = (VTOMI4(vp)->mi_recovthread == curthread);
 462         mutex_exit(&mi->mi_lock);
 463 
 464         rp = VTOR4(vp);
 465         mutex_enter(&rp->r_statelock);
 466         was_serial = (rp->r_serial == curthread);
 467         if (rp->r_serial && !was_serial) {
 468                 klwp_t *lwp = ttolwp(curthread);
 469 
 470                 /*
 471                  * If we're the recovery thread, then purge current attrs
 472                  * and bail out to avoid potential deadlock between another
 473                  * thread caching attrs (r_serial thread), recov thread,
 474                  * and an async writer thread.
 475                  */
 476                 if (recov) {
 477                         PURGE_ATTRCACHE4_LOCKED(rp);
 478                         mutex_exit(&rp->r_statelock);
 479                         return;
 480                 }
 481 
 482                 if (lwp != NULL)
 483                         lwp->lwp_nostop++;
 484                 while (rp->r_serial != NULL) {
 485                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 486                                 mutex_exit(&rp->r_statelock);
 487                                 if (lwp != NULL)
 488                                         lwp->lwp_nostop--;
 489                                 return;
 490                         }
 491                 }
 492                 if (lwp != NULL)
 493                         lwp->lwp_nostop--;
 494         }
 495 
 496         /*
 497          * If there is a page flush thread, the current thread needs to
 498          * bail out, to prevent a possible deadlock between the current
 499          * thread (which might be in a start_op/end_op region), the
 500          * recovery thread, and the page flush thread.  Expire the
 501          * attribute cache, so that any attributes the current thread was
 502          * going to set are not lost.
 503          */
 504         if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
 505                 PURGE_ATTRCACHE4_LOCKED(rp);
 506                 mutex_exit(&rp->r_statelock);
 507                 return;
 508         }
 509 
 510         if (rp->r_time_attr_saved > t) {
 511                 /*
 512                  * Attributes have been cached since these attributes were
 513                  * probably made. If there is an inconsistency in what is
 514                  * cached, mark them invalid. If not, don't act on them.
 515                  */
 516                 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 517                         PURGE_ATTRCACHE4_LOCKED(rp);
 518                 mutex_exit(&rp->r_statelock);
 519                 return;
 520         }
 521         set_time_cache_inval = 0;
 522         if (cinfo) {
 523                 /*
 524                  * Only directory modifying callers pass non-NULL cinfo.
 525                  */
 526                 ASSERT(vp->v_type == VDIR);
 527                 /*
 528                  * If the cache timeout either doesn't exist or hasn't expired,
 529                  * and dir didn't change on server before dirmod op
 530                  * and dir didn't change after dirmod op but before getattr
 531                  * then there's a chance that the client's cached data for
 532                  * this object is current (not stale).  No immediate cache
 533                  * flush is required.
 534                  *
 535                  */
 536                 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
 537                     cinfo->before == rp->r_change &&
 538                     (garp->n4g_change_valid &&
 539                     cinfo->after == garp->n4g_change)) {
 540 
 541                         /*
 542                          * If atomic isn't set, then the before/after info
 543                          * cannot be blindly trusted.  For this case, we tell
 544                          * nfs4_attrcache_va to cache the attrs but also
 545                          * establish an absolute maximum cache timeout.  When
 546                          * the timeout is reached, caches will be flushed.
 547                          */
 548                         if (! cinfo->atomic)
 549                                 set_time_cache_inval = 1;
 550                 } else {
 551 
 552                         /*
 553                          * We're not sure exactly what changed, but we know
 554                          * what to do.  flush all caches for dir.  remove the
 555                          * attr timeout.
 556                          *
 557                          * a) timeout expired.  flush all caches.
 558                          * b) r_change != cinfo.before.  flush all caches.
 559                          * c) r_change == cinfo.before, but cinfo.after !=
 560                          *    post-op getattr(change).  flush all caches.
 561                          * d) post-op getattr(change) not provided by server.
 562                          *    flush all caches.
 563                          */
 564                         mtime_changed = 1;
 565                         ctime_changed = 1;
 566                         rp->r_time_cache_inval = 0;
 567                 }
 568         } else {
 569                 /*
 570                  * After writing data to the file on the remote server, the
 571                  * write thread always sets R4WRITEMODIFIED to indicate that
 572                  * the file was modified with a WRITE operation and that the
 573                  * attribute cache has been marked as timed out.  If it is
 574                  * set, then do not check for mtime and ctime change.
 575                  */
 576                 if (!(rp->r_flags & R4WRITEMODIFIED)) {
 577                         if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 578                                 mtime_changed = 1;
 579 
 580                         if (rp->r_attr.va_ctime.tv_sec !=
 581                             vap->va_ctime.tv_sec ||
 582                             rp->r_attr.va_ctime.tv_nsec !=
 583                             vap->va_ctime.tv_nsec)
 584                                 ctime_changed = 1;
 585                 } else {
 586                         writemodify_set = B_TRUE;
 587                 }
 588         }
 589 
 590         preattr_rsize = rp->r_size;
 591 
 592         nfs4_attrcache_va(vp, garp, set_time_cache_inval);
 593 
 594         /*
 595          * If we have updated the file size in nfs4_attrcache_va, then as
 596          * soon as we drop the statelock we will be in the process of
 597          * purging all of our caches and updating them.  It is possible
 598          * for another thread to pick up this new file size and read in
 599          * zeroed data.  Stall other threads until the purge is complete.
 600          */
 601         if ((!cinfo) && (rp->r_size != preattr_rsize)) {
 602                 /*
 603                  * If R4WRITEMODIFIED was set and we have updated the file
 604                  * size, the file size returned by the server is not
 605                  * necessarily the result of this client's WRITE.  We need
 606                  * to purge all caches.
 607                  */
 608                 if (writemodify_set)
 609                         mtime_changed = 1;
 610 
 611                 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
 612                         rp->r_flags |= R4INCACHEPURGE;
 613                         cachepurge_set = B_TRUE;
 614                 }
 615         }
 616 
 617         if (!mtime_changed && !ctime_changed) {
 618                 mutex_exit(&rp->r_statelock);
 619                 return;
 620         }
 621 
 622         rp->r_serial = curthread;
 623 
 624         mutex_exit(&rp->r_statelock);
 625 
 626         /*
 627          * If we're the recov thread, then force async nfs4_purge_caches
 628          * to avoid potential deadlock.
 629          */
 630         if (mtime_changed)
 631                 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
 632 
 633         if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
 634                 mutex_enter(&rp->r_statelock);
 635                 rp->r_flags &= ~R4INCACHEPURGE;
 636                 cv_broadcast(&rp->r_cv);
 637                 mutex_exit(&rp->r_statelock);
 638                 cachepurge_set = B_FALSE;
 639         }
 640 
 641         if (ctime_changed) {
 642                 (void) nfs4_access_purge_rp(rp);
 643                 if (rp->r_secattr != NULL) {
 644                         mutex_enter(&rp->r_statelock);
 645                         vsp = rp->r_secattr;
 646                         rp->r_secattr = NULL;
 647                         mutex_exit(&rp->r_statelock);
 648                         if (vsp != NULL)
 649                                 nfs4_acl_free_cache(vsp);
 650                 }
 651         }
 652 
 653         if (!was_serial) {
 654                 mutex_enter(&rp->r_statelock);
 655                 rp->r_serial = NULL;
 656                 cv_broadcast(&rp->r_cv);
 657                 mutex_exit(&rp->r_statelock);
 658         }
 659 }
 660 
 661 /*
 662  * Set attributes cache for given vnode using virtual attributes.
 663  *
 664  * Set the timeout value on the attribute cache and fill it
 665  * with the passed in attributes.
 666  *
 667  * The caller must be holding r_statelock.
 668  */
 669 static void
 670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
 671 {
 672         rnode4_t *rp;
 673         mntinfo4_t *mi;
 674         hrtime_t delta;
 675         hrtime_t now;
 676         vattr_t *vap = &garp->n4g_va;
 677 
 678         rp = VTOR4(vp);
 679 
 680         ASSERT(MUTEX_HELD(&rp->r_statelock));
 681         ASSERT(vap->va_mask == AT_ALL);
 682 
 683         /* Switch to master before checking v_flag */
 684         if (IS_SHADOW(vp, rp))
 685                 vp = RTOV4(rp);
 686 
 687         now = gethrtime();
 688 
 689         mi = VTOMI4(vp);
 690 
 691         /*
 692          * Only establish a new cache timeout (if requested).  Never
 693          * extend a timeout.  Never clear a timeout.  Clearing a timeout
 694          * is done by nfs4_update_dircaches (ancestor in our call chain)
 695          */
 696         if (set_cache_timeout && ! rp->r_time_cache_inval)
 697                 rp->r_time_cache_inval = now + mi->mi_acdirmax;
 698 
 699         /*
 700          * Delta is the number of nanoseconds that we will
 701          * cache the attributes of the file.  It is based on
 702          * the number of nanoseconds since the last time that
 703          * we detected a change.  The assumption is that files
 704          * that changed recently are likely to change again.
 705          * There are, however, enforced minimum and maximum values
 706          * for regular files and for directories.
 707          *
 708          * Using the time since last change was detected
 709          * eliminates direct comparison or calculation
 710          * using mixed client and server times.  NFS does
 711          * not make any assumptions regarding the client
 712          * and server clocks being synchronized.
 713          */
 714         if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 715             vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 716             vap->va_size != rp->r_attr.va_size) {
 717                 rp->r_time_attr_saved = now;
 718         }
 719 
 720         if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
 721                 delta = 0;
 722         else {
 723                 delta = now - rp->r_time_attr_saved;
 724                 if (vp->v_type == VDIR) {
 725                         if (delta < mi->mi_acdirmin)
 726                                 delta = mi->mi_acdirmin;
 727                         else if (delta > mi->mi_acdirmax)
 728                                 delta = mi->mi_acdirmax;
 729                 } else {
 730                         if (delta < mi->mi_acregmin)
 731                                 delta = mi->mi_acregmin;
 732                         else if (delta > mi->mi_acregmax)
 733                                 delta = mi->mi_acregmax;
 734                 }
 735         }
 736         rp->r_time_attr_inval = now + delta;
 737 
 738         rp->r_attr = *vap;
 739         if (garp->n4g_change_valid)
 740                 rp->r_change = garp->n4g_change;
 741 
 742         /*
 743          * The attributes that were returned may be valid and can
 744          * be used, but they may not be allowed to be cached.
 745          * Reset the timers to cause immediate invalidation and
 746          * clear r_change so no VERIFY operations will succeed.
 747          */
 748         if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
 749                 rp->r_time_attr_inval = now;
 750                 rp->r_time_attr_saved = now;
 751                 rp->r_change = 0;
 752         }
 753 
 754         /*
 755          * If mounted_on_fileid returned AND the object is a stub,
 756          * then set object's va_nodeid to the mounted over fid
 757          * returned by server.
 758          *
 759          * If mounted_on_fileid not provided/supported, then
 760          * just set it to 0 for now.  Eventually it would be
 761          * better to set it to a hashed version of FH.  This
 762          * would probably be good enough to provide a unique
 763          * fid/d_ino within a dir.
 764          *
 765          * We don't need to carry mounted_on_fileid in the
 766          * rnode as long as the client never requests fileid
 767          * without also requesting mounted_on_fileid.  For
 768          * now, it stays.
 769          */
 770         if (garp->n4g_mon_fid_valid) {
 771                 rp->r_mntd_fid = garp->n4g_mon_fid;
 772 
 773                 if (RP_ISSTUB(rp))
 774                         rp->r_attr.va_nodeid = rp->r_mntd_fid;
 775         }
 776 
 777         /*
 778          * Check to see if there are valid pathconf bits to
 779          * cache in the rnode.
 780          */
 781         if (garp->n4g_ext_res) {
 782                 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
 783                         rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
 784                 } else {
 785                         if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
 786                                 rp->r_pathconf.pc4_xattr_valid = TRUE;
 787                                 rp->r_pathconf.pc4_xattr_exists =
 788                                     garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
 789                         }
 790                 }
 791         }
 792         /*
 793          * Update the size of the file if there is no cached data or if
 794          * the cached data is clean and there is no data being written
 795          * out.
 796          */
 797         if (rp->r_size != vap->va_size &&
 798             (!vn_has_cached_data(vp) ||
 799             (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
 800                 rp->r_size = vap->va_size;
 801         }
 802         nfs_setswaplike(vp, vap);
 803         rp->r_flags &= ~R4WRITEMODIFIED;
 804 }
 805 
 806 /*
 807  * Get attributes over-the-wire and update attributes cache
 808  * if no error occurred in the over-the-wire operation.
 809  * Return 0 if successful, otherwise error.
 810  */
 811 int
 812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
 813 {
 814         mntinfo4_t *mi = VTOMI4(vp);
 815         hrtime_t t;
 816         nfs4_recov_state_t recov_state;
 817         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 818 
 819         recov_state.rs_flags = 0;
 820         recov_state.rs_num_retry_despite_err = 0;
 821 
 822         /* Save the original mount point security flavor */
 823         (void) save_mnt_secinfo(mi->mi_curr_serv);
 824 
 825 recov_retry:
 826 
 827         if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
 828             &recov_state, NULL))) {
 829                 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 830                 return (e.error);
 831         }
 832 
 833         t = gethrtime();
 834 
 835         nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
 836 
 837         if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
 838                 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
 839                     NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
 840                         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
 841                             &recov_state, 1);
 842                         goto recov_retry;
 843                 }
 844         }
 845 
 846         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
 847 
 848         if (!e.error) {
 849                 if (e.stat == NFS4_OK) {
 850                         nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
 851                 } else {
 852                         e.error = geterrno4(e.stat);
 853 
 854                         nfs4_purge_stale_fh(e.error, vp, cr);
 855                 }
 856         }
 857 
 858         /*
 859          * If this getattr was for a node that is a stub for a
 860          * crossed mount point, keep the original secinfo flavor
 861          * for the current file system, not the crossed one.
 862          */
 863         (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 864 
 865         return (e.error);
 866 }
 867 
 868 /*
 869  * Generate a compound to get attributes over-the-wire.
 870  */
 871 void
 872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
 873     nfs4_error_t *ep, cred_t *cr, int get_acl)
 874 {
 875         COMPOUND4args_clnt args;
 876         COMPOUND4res_clnt res;
 877         int doqueue;
 878         rnode4_t *rp = VTOR4(vp);
 879         nfs_argop4 argop[2];
 880 
 881         args.ctag = TAG_GETATTR;
 882 
 883         args.array_len = 2;
 884         args.array = argop;
 885 
 886         /* putfh */
 887         argop[0].argop = OP_CPUTFH;
 888         argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
 889 
 890         /* getattr */
 891         /*
 892          * Unlike nfs version 2 and 3, where getattr returns all the
 893          * attributes, nfs version 4 returns only the ones explicitly
 894          * asked for. This creates problems, as some system functions
 895          * (e.g. cache check) require certain attributes and if the
 896          * cached node lacks some attributes such as uid/gid, it can
 897          * affect system utilities (e.g. "ls") that rely on the information
 898          * to be there. This can lead to anything from system crashes to
 899          * corrupted information processed by user apps.
 900          * So to ensure that all bases are covered, request at least
 901          * the AT_ALL attribute mask.
 902          */
 903         argop[1].argop = OP_GETATTR;
 904         argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
 905         if (get_acl)
 906                 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
 907         argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
 908 
 909         doqueue = 1;
 910 
 911         rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
 912 
 913         if (ep->error)
 914                 return;
 915 
 916         if (res.status != NFS4_OK) {
 917                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 918                 return;
 919         }
 920 
 921         *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
 922 
 923         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 924 }
 925 
 926 /*
 927  * Return either cached or remote attributes.  If remote attributes are
 928  * fetched, use them to check and invalidate caches, then cache them.
 929  */
 930 int
 931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
 932 {
 933         int error;
 934         rnode4_t *rp;
 935         nfs4_ga_res_t gar;
 936 
 937         ASSERT(nfs4_consistent_type(vp));
 938 
 939         /*
 940          * If we've got cached attributes, we're done, otherwise go
 941          * to the server to get attributes, which will update the cache
 942          * in the process. Either way, use the cached attributes for
 943          * the caller's vattr_t.
 944          *
 945          * Note that we ignore the gar set by the OTW call: the attr caching
 946          * code may make adjustments when storing to the rnode, and we want
 947          * to see those changes here.
 948          */
 949         rp = VTOR4(vp);
 950         error = 0;
 951         mutex_enter(&rp->r_statelock);
 952         if (!ATTRCACHE4_VALID(vp)) {
 953                 mutex_exit(&rp->r_statelock);
 954                 error = nfs4_getattr_otw(vp, &gar, cr, 0);
 955                 mutex_enter(&rp->r_statelock);
 956         }
 957 
 958         if (!error)
 959                 *vap = rp->r_attr;
 960 
 961         /* Return the client's view of file size */
 962         vap->va_size = rp->r_size;
 963 
 964         mutex_exit(&rp->r_statelock);
 965 
 966         ASSERT(nfs4_consistent_type(vp));
 967 
 968         return (error);
 969 }
 970 
 971 int
 972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
 973     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
 974 {
 975         COMPOUND4args_clnt args;
 976         COMPOUND4res_clnt res;
 977         int doqueue;
 978         nfs_argop4 argop[2];
 979         mntinfo4_t *mi = VTOMI4(vp);
 980         bool_t needrecov = FALSE;
 981         nfs4_recov_state_t recov_state;
 982         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 983         nfs4_ga_ext_res_t *gerp;
 984 
 985         recov_state.rs_flags = 0;
 986         recov_state.rs_num_retry_despite_err = 0;
 987 
 988 recov_retry:
 989         args.ctag = tag_type;
 990 
 991         args.array_len = 2;
 992         args.array = argop;
 993 
 994         e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
 995         if (e.error)
 996                 return (e.error);
 997 
 998         /* putfh */
 999         argop[0].argop = OP_CPUTFH;
1000         argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1001 
1002         /* getattr */
1003         argop[1].argop = OP_GETATTR;
1004         argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1005         argop[1].nfs_argop4_u.opgetattr.mi = mi;
1006 
1007         doqueue = 1;
1008 
1009         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1010             "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1011             rnode4info(VTOR4(vp))));
1012 
1013         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1014 
1015         needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1016         if (!needrecov && e.error) {
1017                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1018                     needrecov);
1019                 return (e.error);
1020         }
1021 
1022         if (needrecov) {
1023                 bool_t abort;
1024 
1025                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1026                     "nfs4_attr_otw: initiating recovery\n"));
1027 
1028                 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1029                     NULL, OP_GETATTR, NULL, NULL, NULL);
1030                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1031                     needrecov);
1032                 if (!e.error) {
1033                         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1034                         e.error = geterrno4(res.status);
1035                 }
1036                 if (abort == FALSE)
1037                         goto recov_retry;
1038                 return (e.error);
1039         }
1040 
1041         if (res.status) {
1042                 e.error = geterrno4(res.status);
1043         } else {
1044                 gerp = garp->n4g_ext_res;
1045                 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1046                     garp, sizeof (nfs4_ga_res_t));
1047                 garp->n4g_ext_res = gerp;
1048                 if (garp->n4g_ext_res &&
1049                     res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1050                         bcopy(res.array[1].nfs_resop4_u.opgetattr.
1051                             ga_res.n4g_ext_res,
1052                             garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1053         }
1054         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1055         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1056             needrecov);
1057         return (e.error);
1058 }
1059 
1060 /*
1061  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1062  * for the demand-based allocation of async threads per-mount.  The
1063  * nfs_async_timeout is the amount of time a thread will live after it
1064  * becomes idle, unless new I/O requests are received before the thread
1065  * dies.  See nfs4_async_putpage and nfs4_async_start.
1066  */
1067 
1068 static void     nfs4_async_start(struct vfs *);
1069 static void     nfs4_async_pgops_start(struct vfs *);
1070 static void     nfs4_async_common_start(struct vfs *, int);
1071 
1072 static void
1073 free_async_args4(struct nfs4_async_reqs *args)
1074 {
1075         rnode4_t *rp;
1076 
1077         if (args->a_io != NFS4_INACTIVE) {
1078                 rp = VTOR4(args->a_vp);
1079                 mutex_enter(&rp->r_statelock);
1080                 rp->r_count--;
1081                 if (args->a_io == NFS4_PUTAPAGE ||
1082                     args->a_io == NFS4_PAGEIO)
1083                         rp->r_awcount--;
1084                 cv_broadcast(&rp->r_cv);
1085                 mutex_exit(&rp->r_statelock);
1086                 VN_RELE(args->a_vp);
1087         }
1088         crfree(args->a_cred);
1089         kmem_free(args, sizeof (*args));
1090 }
1091 
1092 /*
1093  * Cross-zone thread creation and NFS access are disallowed, yet fsflush() and
1094  * pageout(), running in the global zone, have legitimate reasons to do
1095  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1096  * use of a per-mount "asynchronous requests manager thread" which is
1097  * signaled by the various asynchronous work routines when there is
1098  * asynchronous work to be done.  It is responsible for creating new
1099  * worker threads if necessary, and notifying existing worker threads
1100  * that there is work to be done.
1101  *
1102  * In other words, it will "take the specifications from the customers and
1103  * give them to the engineers."
1104  *
1105  * Worker threads die off of their own accord if they are no longer
1106  * needed.
1107  *
1108  * This thread is killed when the zone is going away or the filesystem
1109  * is being unmounted.
1110  */
1111 void
1112 nfs4_async_manager(vfs_t *vfsp)
1113 {
1114         callb_cpr_t cprinfo;
1115         mntinfo4_t *mi;
1116         uint_t max_threads;
1117 
1118         mi = VFTOMI4(vfsp);
1119 
1120         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121             "nfs4_async_manager");
1122 
1123         mutex_enter(&mi->mi_async_lock);
1124         /*
1125          * We want to stash the max number of threads that this mount was
1126          * allowed so we can use it later when the variable is set to zero as
1127          * part of the zone/mount going away.
1128          *
1129          * We want to be able to create at least one thread to handle
1130          * asynchronous inactive calls.
1131          */
1132         max_threads = MAX(mi->mi_max_threads, 1);
1133         /*
1134          * We don't want to wait for mi_max_threads to go to zero, since that
1135          * happens as part of a failed unmount, but this thread should only
1136          * exit when the mount is really going away.
1137          *
1138          * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1139          * attempted: the various _async_*() functions know to do things
1140          * inline if mi_max_threads == 0.  Henceforth we just drain out the
1141          * outstanding requests.
1142          *
1143          * Note that we still create zthreads even if we notice the zone is
1144          * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1145          * shutdown sequence to take slightly longer in some cases, but
1146          * doesn't violate the protocol, as all threads will exit as soon as
1147          * they're done processing the remaining requests.
1148          */
1149         for (;;) {
1150                 while (mi->mi_async_req_count > 0) {
1151                         /*
1152                          * Paranoia: If the mount started out having
1153                          * (mi->mi_max_threads == 0), and the value was
1154                          * later changed (via a debugger or somesuch),
1155                          * we could be confused since we will think we
1156                          * can't create any threads, and the calling
1157                          * code (which looks at the current value of
1158                          * mi->mi_max_threads, now non-zero) thinks we
1159                          * can.
1160                          *
1161                          * So, because we're paranoid, we create threads
1162                          * up to the maximum of the original and the
1163                          * current value. This means that future
1164                          * (debugger-induced) alterations of
1165                          * mi->mi_max_threads are ignored for our
1166                          * purposes, but who told them they could change
1167                          * random values on a live kernel anyhow?
1168                          */
1169                         if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1170                             MAX(mi->mi_max_threads, max_threads)) {
1171                                 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1172                                 mutex_exit(&mi->mi_async_lock);
1173                                 MI4_HOLD(mi);
1174                                 VFS_HOLD(vfsp); /* hold for new thread */
1175                                 (void) zthread_create(NULL, 0, nfs4_async_start,
1176                                     vfsp, 0, minclsyspri);
1177                                 mutex_enter(&mi->mi_async_lock);
1178                         } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1179                             NUM_ASYNC_PGOPS_THREADS) {
1180                                 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1181                                 mutex_exit(&mi->mi_async_lock);
1182                                 MI4_HOLD(mi);
1183                                 VFS_HOLD(vfsp); /* hold for new thread */
1184                                 (void) zthread_create(NULL, 0,
1185                                     nfs4_async_pgops_start, vfsp, 0,
1186                                     minclsyspri);
1187                                 mutex_enter(&mi->mi_async_lock);
1188                         }
1189                         NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1190                         ASSERT(mi->mi_async_req_count != 0);
1191                         mi->mi_async_req_count--;
1192                 }
1193 
1194                 mutex_enter(&mi->mi_lock);
1195                 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1196                         mutex_exit(&mi->mi_lock);
1197                         break;
1198                 }
1199                 mutex_exit(&mi->mi_lock);
1200 
1201                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1202                 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1203                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1204         }
1205 
1206         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1207             "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1208         /*
1209          * Let everyone know we're done.
1210          */
1211         mi->mi_manager_thread = NULL;
1212         /*
1213          * Wake up the inactive thread.
1214          */
1215         cv_broadcast(&mi->mi_inact_req_cv);
1216         /*
1217          * Wake up anyone sitting in nfs4_async_manager_stop()
1218          */
1219         cv_broadcast(&mi->mi_async_cv);
1220         /*
1221          * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1222          * since CALLB_CPR_EXIT is actually responsible for releasing
1223          * 'mi_async_lock'.
1224          */
1225         CALLB_CPR_EXIT(&cprinfo);
1226         VFS_RELE(vfsp); /* release thread's hold */
1227         MI4_RELE(mi);
1228         zthread_exit();
1229 }
1230 
1231 /*
1232  * Signal (and wait for) the async manager thread to clean up and go away.
1233  */
1234 void
1235 nfs4_async_manager_stop(vfs_t *vfsp)
1236 {
1237         mntinfo4_t *mi = VFTOMI4(vfsp);
1238 
1239         mutex_enter(&mi->mi_async_lock);
1240         mutex_enter(&mi->mi_lock);
1241         mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1242         mutex_exit(&mi->mi_lock);
1243         cv_broadcast(&mi->mi_async_reqs_cv);
1244         /*
1245          * Wait for the async manager thread to die.
1246          */
1247         while (mi->mi_manager_thread != NULL)
1248                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1249         mutex_exit(&mi->mi_async_lock);
1250 }
1251 
1252 int
1253 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1254     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1255     u_offset_t, caddr_t, struct seg *, cred_t *))
1256 {
1257         rnode4_t *rp;
1258         mntinfo4_t *mi;
1259         struct nfs4_async_reqs *args;
1260 
1261         rp = VTOR4(vp);
1262         ASSERT(rp->r_freef == NULL);
1263 
1264         mi = VTOMI4(vp);
1265 
1266         /*
1267          * If addr falls in a different segment, don't bother doing readahead.
1268          */
1269         if (addr >= seg->s_base + seg->s_size)
1270                 return (-1);
1271 
1272         /*
1273          * If we can't allocate a request structure, punt on the readahead.
1274          */
1275         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1276                 return (-1);
1277 
1278         /*
1279          * If a lock operation is pending, don't initiate any new
1280          * readaheads.  Otherwise, bump r_count to indicate the new
1281          * asynchronous I/O.
1282          */
1283         if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1284                 kmem_free(args, sizeof (*args));
1285                 return (-1);
1286         }
1287         mutex_enter(&rp->r_statelock);
1288         rp->r_count++;
1289         mutex_exit(&rp->r_statelock);
1290         nfs_rw_exit(&rp->r_lkserlock);
1291 
1292         args->a_next = NULL;
1293 #ifdef DEBUG
1294         args->a_queuer = curthread;
1295 #endif
1296         VN_HOLD(vp);
1297         args->a_vp = vp;
1298         ASSERT(cr != NULL);
1299         crhold(cr);
1300         args->a_cred = cr;
1301         args->a_io = NFS4_READ_AHEAD;
1302         args->a_nfs4_readahead = readahead;
1303         args->a_nfs4_blkoff = blkoff;
1304         args->a_nfs4_seg = seg;
1305         args->a_nfs4_addr = addr;
1306 
1307         mutex_enter(&mi->mi_async_lock);
1308 
1309         /*
1310          * If asyncio has been disabled, don't bother with readahead.
1311          */
1312         if (mi->mi_max_threads == 0) {
1313                 mutex_exit(&mi->mi_async_lock);
1314                 goto noasync;
1315         }
1316 
1317         /*
1318          * Link request structure into the async list and
1319          * wakeup async thread to do the i/o.
1320          */
1321         if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1322                 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1323                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1324         } else {
1325                 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1326                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1327         }
1328 
1329         if (mi->mi_io_kstats) {
1330                 mutex_enter(&mi->mi_lock);
1331                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1332                 mutex_exit(&mi->mi_lock);
1333         }
1334 
1335         mi->mi_async_req_count++;
1336         ASSERT(mi->mi_async_req_count != 0);
1337         cv_signal(&mi->mi_async_reqs_cv);
1338         mutex_exit(&mi->mi_async_lock);
1339         return (0);
1340 
1341 noasync:
1342         mutex_enter(&rp->r_statelock);
1343         rp->r_count--;
1344         cv_broadcast(&rp->r_cv);
1345         mutex_exit(&rp->r_statelock);
1346         VN_RELE(vp);
1347         crfree(cr);
1348         kmem_free(args, sizeof (*args));
1349         return (-1);
1350 }
1351 
1352 static void
1353 nfs4_async_start(struct vfs *vfsp)
1354 {
1355         nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1356 }
1357 
1358 static void
1359 nfs4_async_pgops_start(struct vfs *vfsp)
1360 {
1361         nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1362 }
1363 
1364 /*
1365  * The async queues for each mounted file system are arranged as a
1366  * set of queues, one for each async i/o type.  Requests are taken
1367  * from the queues in a round-robin fashion.  A number of consecutive
1368  * requests are taken from each queue before moving on to the next
1369  * queue.  This functionality may allow the NFS Version 2 server to do
1370  * write clustering, even if the client is mixing writes and reads
1371  * because it will take multiple write requests from the queue
1372  * before processing any of the other async i/o types.
1373  *
1374  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1375  * model defined by cpr to suspend the system. Specifically over the
1376  * wire calls are cpr-unsafe. The thread should be reevaluated in
1377  * case of future updates to the cpr model.
1378  */
1379 static void
1380 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1381 {
1382         struct nfs4_async_reqs *args;
1383         mntinfo4_t *mi = VFTOMI4(vfsp);
1384         clock_t time_left = 1;
1385         callb_cpr_t cprinfo;
1386         int i;
1387         extern int nfs_async_timeout;
1388         int async_types;
1389         kcondvar_t *async_work_cv;
1390 
1391         if (async_queue == NFS4_ASYNC_QUEUE) {
1392                 async_types = NFS4_ASYNC_TYPES;
1393                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1394         } else {
1395                 async_types = NFS4_ASYNC_PGOPS_TYPES;
1396                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1397         }
1398 
1399         /*
1400          * Dynamic initialization of nfs_async_timeout to allow nfs to be
1401          * built in an implementation independent manner.
1402          */
1403         if (nfs_async_timeout == -1)
1404                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1405 
1406         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1407 
1408         mutex_enter(&mi->mi_async_lock);
1409         for (;;) {
1410                 /*
1411                  * Find the next queue containing an entry.  We start
1412                  * at the current queue pointer and then round robin
1413                  * through all of them until we either find a non-empty
1414                  * queue or have looked through all of them.
1415                  */
1416                 for (i = 0; i < async_types; i++) {
1417                         args = *mi->mi_async_curr[async_queue];
1418                         if (args != NULL)
1419                                 break;
1420                         mi->mi_async_curr[async_queue]++;
1421                         if (mi->mi_async_curr[async_queue] ==
1422                             &mi->mi_async_reqs[async_types]) {
1423                                 mi->mi_async_curr[async_queue] =
1424                                     &mi->mi_async_reqs[0];
1425                         }
1426                 }
1427                 /*
1428                  * If we didn't find an entry, then block until woken up
1429                  * again and then look through the queues again.
1430                  */
1431                 if (args == NULL) {
1432                         /*
1433                          * Exiting is considered to be safe for CPR as well
1434                          */
1435                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1436 
1437                         /*
1438                          * Wakeup thread waiting to unmount the file
1439                          * system only if all async threads are inactive.
1440                          *
1441                          * If we've timed-out and there's nothing to do,
1442                          * then get rid of this thread.
1443                          */
1444                         if (mi->mi_max_threads == 0 || time_left <= 0) {
1445                                 --mi->mi_threads[async_queue];
1446 
1447                                 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1448                                     mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1449                                         cv_signal(&mi->mi_async_cv);
1450                                 CALLB_CPR_EXIT(&cprinfo);
1451                                 VFS_RELE(vfsp); /* release thread's hold */
1452                                 MI4_RELE(mi);
1453                                 zthread_exit();
1454                                 /* NOTREACHED */
1455                         }
1456                         time_left = cv_reltimedwait(async_work_cv,
1457                             &mi->mi_async_lock, nfs_async_timeout,
1458                             TR_CLOCK_TICK);
1459 
1460                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1461 
1462                         continue;
1463                 } else {
1464                         time_left = 1;
1465                 }
1466 
1467                 /*
1468                  * Remove the request from the async queue and then
1469                  * update the current async request queue pointer.  If
1470                  * the current queue is empty or we have removed enough
1471                  * consecutive entries from it, then reset the counter
1472                  * for this queue and then move the current pointer to
1473                  * the next queue.
1474                  */
1475                 *mi->mi_async_curr[async_queue] = args->a_next;
1476                 if (*mi->mi_async_curr[async_queue] == NULL ||
1477                     --mi->mi_async_clusters[args->a_io] == 0) {
1478                         mi->mi_async_clusters[args->a_io] =
1479                             mi->mi_async_init_clusters;
1480                         mi->mi_async_curr[async_queue]++;
1481                         if (mi->mi_async_curr[async_queue] ==
1482                             &mi->mi_async_reqs[async_types]) {
1483                                 mi->mi_async_curr[async_queue] =
1484                                     &mi->mi_async_reqs[0];
1485                         }
1486                 }
1487 
1488                 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1489                         mutex_enter(&mi->mi_lock);
1490                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1491                         mutex_exit(&mi->mi_lock);
1492                 }
1493 
1494                 mutex_exit(&mi->mi_async_lock);
1495 
1496                 /*
1497                  * Obtain arguments from the async request structure.
1498                  */
1499                 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1500                         (*args->a_nfs4_readahead)(args->a_vp,
1501                             args->a_nfs4_blkoff, args->a_nfs4_addr,
1502                             args->a_nfs4_seg, args->a_cred);
1503                 } else if (args->a_io == NFS4_PUTAPAGE) {
1504                         (void) (*args->a_nfs4_putapage)(args->a_vp,
1505                             args->a_nfs4_pp, args->a_nfs4_off,
1506                             args->a_nfs4_len, args->a_nfs4_flags,
1507                             args->a_cred);
1508                 } else if (args->a_io == NFS4_PAGEIO) {
1509                         (void) (*args->a_nfs4_pageio)(args->a_vp,
1510                             args->a_nfs4_pp, args->a_nfs4_off,
1511                             args->a_nfs4_len, args->a_nfs4_flags,
1512                             args->a_cred);
1513                 } else if (args->a_io == NFS4_READDIR) {
1514                         (void) ((*args->a_nfs4_readdir)(args->a_vp,
1515                             args->a_nfs4_rdc, args->a_cred));
1516                 } else if (args->a_io == NFS4_COMMIT) {
1517                         (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1518                             args->a_nfs4_offset, args->a_nfs4_count,
1519                             args->a_cred);
1520                 } else if (args->a_io == NFS4_INACTIVE) {
1521                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1522                 }
1523 
1524                 /*
1525                  * Now, release the vnode and free the credentials
1526                  * structure.
1527                  */
1528                 free_async_args4(args);
1529                 /*
1530                  * Reacquire the mutex; it is needed at the top of the loop.
1531                  */
1532                 mutex_enter(&mi->mi_async_lock);
1533         }
1534 }
1535 
1536 /*
1537  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1538  * part of VOP_INACTIVE.
1539  */
1540 
1541 void
1542 nfs4_inactive_thread(mntinfo4_t *mi)
1543 {
1544         struct nfs4_async_reqs *args;
1545         callb_cpr_t cprinfo;
1546         vfs_t *vfsp = mi->mi_vfsp;
1547 
1548         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1549             "nfs4_inactive_thread");
1550 
1551         for (;;) {
1552                 mutex_enter(&mi->mi_async_lock);
1553                 args = mi->mi_async_reqs[NFS4_INACTIVE];
1554                 if (args == NULL) {
1555                         mutex_enter(&mi->mi_lock);
1556                         /*
1557                          * We don't want to exit until the async manager is done
1558                          * with its work; hence the check for mi_manager_thread
1559                          * being NULL.
1560                          *
1561                          * The async manager thread will cv_broadcast() on
1562                          * mi_inact_req_cv when it's done, at which point we'll
1563                          * wake up and exit.
1564                          */
1565                         if (mi->mi_manager_thread == NULL)
1566                                 goto die;
1567                         mi->mi_flags |= MI4_INACTIVE_IDLE;
1568                         mutex_exit(&mi->mi_lock);
1569                         cv_signal(&mi->mi_async_cv);
1570                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1571                         cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1572                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1573                         mutex_exit(&mi->mi_async_lock);
1574                 } else {
1575                         mutex_enter(&mi->mi_lock);
1576                         mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1577                         mutex_exit(&mi->mi_lock);
1578                         mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1579                         mutex_exit(&mi->mi_async_lock);
1580                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1581                         crfree(args->a_cred);
1582                         kmem_free(args, sizeof (*args));
1583                 }
1584         }
1585 die:
1586         mutex_exit(&mi->mi_lock);
1587         mi->mi_inactive_thread = NULL;
1588         cv_signal(&mi->mi_async_cv);
1589 
1590         /*
1591          * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1592          * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1593          */
1594         CALLB_CPR_EXIT(&cprinfo);
1595 
1596         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1597             "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1598 
1599         MI4_RELE(mi);
1600         zthread_exit();
1601         /* NOTREACHED */
1602 }
1603 
1604 /*
1605  * nfs4_async_stop:
1606  * Wait for all outstanding putpage operations and the inactive thread to
1607  * complete; like nfs4_async_stop_sig(), but without interruptibility.
1608  */
1609 void
1610 nfs4_async_stop(struct vfs *vfsp)
1611 {
1612         mntinfo4_t *mi = VFTOMI4(vfsp);
1613 
1614         /*
1615          * Wait for all outstanding async operations to complete and for
1616          * worker threads to exit.
1617          */
1618         mutex_enter(&mi->mi_async_lock);
1619         mi->mi_max_threads = 0;
1620         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1621         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1622             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1623                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1624 
1625         /*
1626          * Wait for the inactive thread to finish doing what it's doing.  It
1627          * won't exit until the last reference to the vfs_t goes away.
1628          */
1629         if (mi->mi_inactive_thread != NULL) {
1630                 mutex_enter(&mi->mi_lock);
1631                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1632                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1633                         mutex_exit(&mi->mi_lock);
1634                         cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635                         mutex_enter(&mi->mi_lock);
1636                 }
1637                 mutex_exit(&mi->mi_lock);
1638         }
1639         mutex_exit(&mi->mi_async_lock);
1640 }
1641 
1642 /*
1643  * nfs4_async_stop_sig:
1644  * Wait for all outstanding putpage operations and the inactive thread to
1645  * complete. If a signal is delivered we will abort and return non-zero;
1646  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1647  * need to make it interruptible.
1648  */
1649 int
1650 nfs4_async_stop_sig(struct vfs *vfsp)
1651 {
1652         mntinfo4_t *mi = VFTOMI4(vfsp);
1653         ushort_t omax;
1654         bool_t intr = FALSE;
1655 
1656         /*
1657          * Wait for all outstanding putpage operations to complete and for
1658          * worker threads to exit.
1659          */
1660         mutex_enter(&mi->mi_async_lock);
1661         omax = mi->mi_max_threads;
1662         mi->mi_max_threads = 0;
1663         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1664         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1665             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1666                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1667                         intr = TRUE;
1668                         goto interrupted;
1669                 }
1670         }
1671 
1672         /*
1673          * Wait for the inactive thread to finish doing what it's doing.  It
1674          * won't exit until the last reference to the vfs_t goes away.
1675          */
1676         if (mi->mi_inactive_thread != NULL) {
1677                 mutex_enter(&mi->mi_lock);
1678                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680                         mutex_exit(&mi->mi_lock);
1681                         if (!cv_wait_sig(&mi->mi_async_cv,
1682                             &mi->mi_async_lock)) {
1683                                 intr = TRUE;
1684                                 goto interrupted;
1685                         }
1686                         mutex_enter(&mi->mi_lock);
1687                 }
1688                 mutex_exit(&mi->mi_lock);
1689         }
1690 interrupted:
1691         if (intr)
1692                 mi->mi_max_threads = omax;
1693         mutex_exit(&mi->mi_async_lock);
1694 
1695         return (intr);
1696 }
1697 
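     /*
      * Queue a putpage request for the async worker threads.  If the
      * request cannot be queued (no request structure can be allocated,
      * or async i/o has been disabled), fall through to the noasync
      * path, which does the putpage synchronously except for
      * pageout/fsflush and cross-zone callers.
      */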
1698 int
1699 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701     u_offset_t, size_t, int, cred_t *))
1702 {
1703         rnode4_t *rp;
1704         mntinfo4_t *mi;
1705         struct nfs4_async_reqs *args;
1706 
1707         ASSERT(flags & B_ASYNC);
1708         ASSERT(vp->v_vfsp != NULL);
1709 
1710         rp = VTOR4(vp);
1711         ASSERT(rp->r_count > 0);
1712 
1713         mi = VTOMI4(vp);
1714 
1715         /*
1716          * If we can't allocate a request structure, do the putpage
1717          * operation synchronously in this thread's context.
1718          */
1719         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720                 goto noasync;
1721 
1722         args->a_next = NULL;
1723 #ifdef DEBUG
1724         args->a_queuer = curthread;
1725 #endif
1726         VN_HOLD(vp);
1727         args->a_vp = vp;
1728         ASSERT(cr != NULL);
1729         crhold(cr);
1730         args->a_cred = cr;
1731         args->a_io = NFS4_PUTAPAGE;
1732         args->a_nfs4_putapage = putapage;
1733         args->a_nfs4_pp = pp;
1734         args->a_nfs4_off = off;
1735         args->a_nfs4_len = (uint_t)len;
1736         args->a_nfs4_flags = flags;
1737 
1738         mutex_enter(&mi->mi_async_lock);
1739 
1740         /*
1741          * If asyncio has been disabled, then make a synchronous request.
1742          * This check is done a second time in case async i/o was disabled
1743          * while this thread was blocked waiting for memory pressure to
1744          * reduce or for the queue to drain.
1745          */
1746         if (mi->mi_max_threads == 0) {
1747                 mutex_exit(&mi->mi_async_lock);
1748 
1749                 VN_RELE(vp);
1750                 crfree(cr);
1751                 kmem_free(args, sizeof (*args));
1752                 goto noasync;
1753         }
1754 
1755         /*
1756          * Link request structure into the async list and
1757          * wakeup async thread to do the i/o.
1758          */
1759         if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760                 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762         } else {
1763                 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765         }
1766 
1767         mutex_enter(&rp->r_statelock);
1768         rp->r_count++;
1769         rp->r_awcount++;
1770         mutex_exit(&rp->r_statelock);
1771 
1772         if (mi->mi_io_kstats) {
1773                 mutex_enter(&mi->mi_lock);
1774                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775                 mutex_exit(&mi->mi_lock);
1776         }
1777 
1778         mi->mi_async_req_count++;
1779         ASSERT(mi->mi_async_req_count != 0);
1780         cv_signal(&mi->mi_async_reqs_cv);
1781         mutex_exit(&mi->mi_async_lock);
1782         return (0);
1783 
1784 noasync:
1785 
1786         if (curproc == proc_pageout || curproc == proc_fsflush) {
1787                 /*
1788                  * If we get here in the context of the pageout/fsflush,
1789                  * we refuse to do a sync write, because this may hang
1790                  * pageout/fsflush (and the machine). In this case, we just
1791                  * re-mark the page as dirty and punt on the page.
1792                  *
1793                  * Make sure B_FORCE isn't set.  We can re-mark the
1794                  * pages as dirty and unlock the pages in one swoop by
1795                  * passing in B_ERROR to pvn_write_done().  However,
1796                  * we should make sure B_FORCE isn't set - we don't
1797                  * want the page tossed before it gets written out.
1798                  */
1799                 if (flags & B_FORCE)
1800                         flags &= ~(B_INVAL | B_FORCE);
1801                 pvn_write_done(pp, flags | B_ERROR);
1802                 return (0);
1803         }
1804 
1805         if (nfs_zone() != mi->mi_zone) {
1806                 /*
1807                  * So this was a cross-zone sync putpage.
1808                  *
1809                  * We pass in B_ERROR to pvn_write_done() to re-mark the
1810                  * pages as dirty and unlock them.
1811                  *
1812                  * We don't want to clear B_FORCE here as the caller presumably
1813                  * knows what they're doing if they set it.
1814                  */
1815                 pvn_write_done(pp, flags | B_ERROR);
1816                 return (EPERM);
1817         }
1818         return ((*putapage)(vp, pp, off, len, flags, cr));
1819 }
1820 
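     /*
      * Queue a pageio request for the async worker threads.  If the
      * request cannot be queued, reads are simply failed back to the VM
      * layer, while writes follow the same noasync rules as
      * nfs4_async_putapage().
      */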
1821 int
1822 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1823     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1824     size_t, int, cred_t *))
1825 {
1826         rnode4_t *rp;
1827         mntinfo4_t *mi;
1828         struct nfs4_async_reqs *args;
1829 
1830         ASSERT(flags & B_ASYNC);
1831         ASSERT(vp->v_vfsp != NULL);
1832 
1833         rp = VTOR4(vp);
1834         ASSERT(rp->r_count > 0);
1835 
1836         mi = VTOMI4(vp);
1837 
1838         /*
1839          * If we can't allocate a request structure, do the pageio
1840          * request synchronously in this thread's context.
1841          */
1842         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1843                 goto noasync;
1844 
1845         args->a_next = NULL;
1846 #ifdef DEBUG
1847         args->a_queuer = curthread;
1848 #endif
1849         VN_HOLD(vp);
1850         args->a_vp = vp;
1851         ASSERT(cr != NULL);
1852         crhold(cr);
1853         args->a_cred = cr;
1854         args->a_io = NFS4_PAGEIO;
1855         args->a_nfs4_pageio = pageio;
1856         args->a_nfs4_pp = pp;
1857         args->a_nfs4_off = io_off;
1858         args->a_nfs4_len = (uint_t)io_len;
1859         args->a_nfs4_flags = flags;
1860 
1861         mutex_enter(&mi->mi_async_lock);
1862 
1863         /*
1864          * If asyncio has been disabled, then make a synchronous request.
1865          * This check is done a second time in case async i/o was disabled
1866          * while this thread was blocked waiting for memory pressure to
1867          * reduce or for the queue to drain.
1868          */
1869         if (mi->mi_max_threads == 0) {
1870                 mutex_exit(&mi->mi_async_lock);
1871 
1872                 VN_RELE(vp);
1873                 crfree(cr);
1874                 kmem_free(args, sizeof (*args));
1875                 goto noasync;
1876         }
1877 
1878         /*
1879          * Link request structure into the async list and
1880          * wakeup async thread to do the i/o.
1881          */
1882         if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1883                 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1884                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1885         } else {
1886                 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1887                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1888         }
1889 
1890         mutex_enter(&rp->r_statelock);
1891         rp->r_count++;
1892         rp->r_awcount++;
1893         mutex_exit(&rp->r_statelock);
1894 
1895         if (mi->mi_io_kstats) {
1896                 mutex_enter(&mi->mi_lock);
1897                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1898                 mutex_exit(&mi->mi_lock);
1899         }
1900 
1901         mi->mi_async_req_count++;
1902         ASSERT(mi->mi_async_req_count != 0);
1903         cv_signal(&mi->mi_async_reqs_cv);
1904         mutex_exit(&mi->mi_async_lock);
1905         return (0);
1906 
1907 noasync:
1908         /*
1909          * If we can't do it ASYNC, for reads we do nothing (but cleanup
1910          * the page list), for writes we do it synchronously, except for
1911          * proc_pageout/proc_fsflush as described below.
1912          */
1913         if (flags & B_READ) {
1914                 pvn_read_done(pp, flags | B_ERROR);
1915                 return (0);
1916         }
1917 
1918         if (curproc == proc_pageout || curproc == proc_fsflush) {
1919                 /*
1920                  * If we get here in the context of the pageout/fsflush,
1921                  * we refuse to do a sync write, because this may hang
1922                  * pageout/fsflush (and the machine). In this case, we just
1923                  * re-mark the page as dirty and punt on the page.
1924                  *
1925                  * Make sure B_FORCE isn't set.  We can re-mark the
1926                  * pages as dirty and unlock the pages in one swoop by
1927                  * passing in B_ERROR to pvn_write_done().  However,
1928                  * we should make sure B_FORCE isn't set - we don't
1929                  * want the page tossed before it gets written out.
1930                  */
1931                 if (flags & B_FORCE)
1932                         flags &= ~(B_INVAL | B_FORCE);
1933                 pvn_write_done(pp, flags | B_ERROR);
1934                 return (0);
1935         }
1936 
1937         if (nfs_zone() != mi->mi_zone) {
1938                 /*
1939                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1940                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1941                  * them.
1942                  *
1943                  * We don't want to clear B_FORCE here as the caller presumably
1944                  * knows what they're doing if they set it.
1945                  */
1946                 pvn_write_done(pp, flags | B_ERROR);
1947                 return (EPERM);
1948         }
1949         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1950 }
1951 
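     /*
      * Queue an async readdir request.  If it cannot be queued, mark the
      * rddir4_cache entry as still needing to be filled (RDDIRREQ) so
      * the readdir can be retried later.
      */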
1952 void
1953 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1954     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1955 {
1956         rnode4_t *rp;
1957         mntinfo4_t *mi;
1958         struct nfs4_async_reqs *args;
1959 
1960         rp = VTOR4(vp);
1961         ASSERT(rp->r_freef == NULL);
1962 
1963         mi = VTOMI4(vp);
1964 
1965         /*
1966          * If we can't allocate a request structure, skip the readdir.
1967          */
1968         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1969                 goto noasync;
1970 
1971         args->a_next = NULL;
1972 #ifdef DEBUG
1973         args->a_queuer = curthread;
1974 #endif
1975         VN_HOLD(vp);
1976         args->a_vp = vp;
1977         ASSERT(cr != NULL);
1978         crhold(cr);
1979         args->a_cred = cr;
1980         args->a_io = NFS4_READDIR;
1981         args->a_nfs4_readdir = readdir;
1982         args->a_nfs4_rdc = rdc;
1983 
1984         mutex_enter(&mi->mi_async_lock);
1985 
1986         /*
1987          * If asyncio has been disabled, then skip this request
1988          */
1989         if (mi->mi_max_threads == 0) {
1990                 mutex_exit(&mi->mi_async_lock);
1991 
1992                 VN_RELE(vp);
1993                 crfree(cr);
1994                 kmem_free(args, sizeof (*args));
1995                 goto noasync;
1996         }
1997 
1998         /*
1999          * Link request structure into the async list and
2000          * wakeup async thread to do the i/o.
2001          */
2002         if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2003                 mi->mi_async_reqs[NFS4_READDIR] = args;
2004                 mi->mi_async_tail[NFS4_READDIR] = args;
2005         } else {
2006                 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2007                 mi->mi_async_tail[NFS4_READDIR] = args;
2008         }
2009 
2010         mutex_enter(&rp->r_statelock);
2011         rp->r_count++;
2012         mutex_exit(&rp->r_statelock);
2013 
2014         if (mi->mi_io_kstats) {
2015                 mutex_enter(&mi->mi_lock);
2016                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2017                 mutex_exit(&mi->mi_lock);
2018         }
2019 
2020         mi->mi_async_req_count++;
2021         ASSERT(mi->mi_async_req_count != 0);
2022         cv_signal(&mi->mi_async_reqs_cv);
2023         mutex_exit(&mi->mi_async_lock);
2024         return;
2025 
2026 noasync:
2027         mutex_enter(&rp->r_statelock);
2028         rdc->entries = NULL;
2029         /*
2030          * Indicate that no one is trying to fill this entry and
2031          * it still needs to be filled.
2032          */
2033         rdc->flags &= ~RDDIR;
2034         rdc->flags |= RDDIRREQ;
2035         rddir4_cache_rele(rp, rdc);
2036         mutex_exit(&rp->r_statelock);
2037 }
2038 
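     /*
      * Queue an async COMMIT for the given list of pages.  If the request
      * cannot be queued, the commit is done synchronously, except in
      * pageout/fsflush context or from the wrong zone, where the pages
      * are simply marked as needing a commit and unlocked.
      */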
2039 void
2040 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2041     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2042     cred_t *))
2043 {
2044         rnode4_t *rp;
2045         mntinfo4_t *mi;
2046         struct nfs4_async_reqs *args;
2047         page_t *pp;
2048 
2049         rp = VTOR4(vp);
2050         mi = VTOMI4(vp);
2051 
2052         /*
2053          * If we can't allocate a request structure, do the commit
2054          * operation synchronously in this thread's context.
2055          */
2056         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2057                 goto noasync;
2058 
2059         args->a_next = NULL;
2060 #ifdef DEBUG
2061         args->a_queuer = curthread;
2062 #endif
2063         VN_HOLD(vp);
2064         args->a_vp = vp;
2065         ASSERT(cr != NULL);
2066         crhold(cr);
2067         args->a_cred = cr;
2068         args->a_io = NFS4_COMMIT;
2069         args->a_nfs4_commit = commit;
2070         args->a_nfs4_plist = plist;
2071         args->a_nfs4_offset = offset;
2072         args->a_nfs4_count = count;
2073 
2074         mutex_enter(&mi->mi_async_lock);
2075 
2076         /*
2077          * If asyncio has been disabled, then make a synchronous request.
2078          * This check is done a second time in case async i/o was disabled
2079          * while this thread was blocked waiting for memory pressure to
2080          * reduce or for the queue to drain.
2081          */
2082         if (mi->mi_max_threads == 0) {
2083                 mutex_exit(&mi->mi_async_lock);
2084 
2085                 VN_RELE(vp);
2086                 crfree(cr);
2087                 kmem_free(args, sizeof (*args));
2088                 goto noasync;
2089         }
2090 
2091         /*
2092          * Link request structure into the async list and
2093          * wakeup async thread to do the i/o.
2094          */
2095         if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2096                 mi->mi_async_reqs[NFS4_COMMIT] = args;
2097                 mi->mi_async_tail[NFS4_COMMIT] = args;
2098         } else {
2099                 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2100                 mi->mi_async_tail[NFS4_COMMIT] = args;
2101         }
2102 
2103         mutex_enter(&rp->r_statelock);
2104         rp->r_count++;
2105         mutex_exit(&rp->r_statelock);
2106 
2107         if (mi->mi_io_kstats) {
2108                 mutex_enter(&mi->mi_lock);
2109                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2110                 mutex_exit(&mi->mi_lock);
2111         }
2112 
2113         mi->mi_async_req_count++;
2114         ASSERT(mi->mi_async_req_count != 0);
2115         cv_signal(&mi->mi_async_reqs_cv);
2116         mutex_exit(&mi->mi_async_lock);
2117         return;
2118 
2119 noasync:
2120         if (curproc == proc_pageout || curproc == proc_fsflush ||
2121             nfs_zone() != mi->mi_zone) {
2122                 while (plist != NULL) {
2123                         pp = plist;
2124                         page_sub(&plist, pp);
2125                         pp->p_fsdata = C_COMMIT;
2126                         page_unlock(pp);
2127                 }
2128                 return;
2129         }
2130         (*commit)(vp, plist, offset, count, cr);
2131 }
2132 
2133 /*
2134  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2135  * reference to the vnode is handed over to the thread; the caller should
2136  * no longer refer to the vnode.
2137  *
2138  * Unlike most of the async routines, this handoff is needed for
2139  * correctness reasons, not just performance.  So doing operations in the
2140  * context of the current thread is not an option.
2141  */
2142 void
2143 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2144 {
2145         mntinfo4_t *mi;
2146         struct nfs4_async_reqs *args;
2147         boolean_t signal_inactive_thread = B_FALSE;
2148 
2149         mi = VTOMI4(vp);
2150 
2151         args = kmem_alloc(sizeof (*args), KM_SLEEP);
2152         args->a_next = NULL;
2153 #ifdef DEBUG
2154         args->a_queuer = curthread;
2155 #endif
2156         args->a_vp = vp;
2157         ASSERT(cr != NULL);
2158         crhold(cr);
2159         args->a_cred = cr;
2160         args->a_io = NFS4_INACTIVE;
2161 
2162         /*
2163          * Note that we don't check mi->mi_max_threads here, since we
2164          * *need* to get rid of this vnode regardless of whether someone
2165          * set nfs4_max_threads to zero in /etc/system.
2166          *
2167          * The manager thread knows about this and is willing to create
2168          * at least one thread to accommodate us.
2169          */
2170         mutex_enter(&mi->mi_async_lock);
2171         if (mi->mi_inactive_thread == NULL) {
2172                 rnode4_t *rp;
2173                 vnode_t *unldvp = NULL;
2174                 char *unlname;
2175                 cred_t *unlcred;
2176 
2177                 mutex_exit(&mi->mi_async_lock);
2178                 /*
2179                  * We just need to free up the memory associated with the
2180                  * vnode, which can be safely done from within the current
2181                  * context.
2182                  */
2183                 crfree(cr);     /* drop our reference */
2184                 kmem_free(args, sizeof (*args));
2185                 rp = VTOR4(vp);
2186                 mutex_enter(&rp->r_statelock);
2187                 if (rp->r_unldvp != NULL) {
2188                         unldvp = rp->r_unldvp;
2189                         rp->r_unldvp = NULL;
2190                         unlname = rp->r_unlname;
2191                         rp->r_unlname = NULL;
2192                         unlcred = rp->r_unlcred;
2193                         rp->r_unlcred = NULL;
2194                 }
2195                 mutex_exit(&rp->r_statelock);
2196                 /*
2197                  * No need to explicitly throw away any cached pages.  The
2198                  * eventual r4inactive() will attempt a synchronous
2199                  * VOP_PUTPAGE() which will immediately fail since the request
2200                  * is coming from the wrong zone, and then will proceed to call
2201                  * nfs4_invalidate_pages() which will clean things up for us.
2202                  *
2203                  * Throw away the delegation here so rp4_addfree()'s attempt to
2204                  * return any existing delegations becomes a no-op.
2205                  */
2206                 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2207                         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2208                             FALSE);
2209                         (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2210                         nfs_rw_exit(&mi->mi_recovlock);
2211                 }
2212                 nfs4_clear_open_streams(rp);
2213 
2214                 rp4_addfree(rp, cr);
2215                 if (unldvp != NULL) {
2216                         kmem_free(unlname, MAXNAMELEN);
2217                         VN_RELE(unldvp);
2218                         crfree(unlcred);
2219                 }
2220                 return;
2221         }
2222 
2223         if (mi->mi_manager_thread == NULL) {
2224                 /*
2225                  * We want to talk to the inactive thread.
2226                  */
2227                 signal_inactive_thread = B_TRUE;
2228         }
2229 
2230         /*
2231          * Enqueue the vnode and wake up either the special thread (empty
2232          * list) or an async thread.
2233          */
2234         if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2235                 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2236                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2237                 signal_inactive_thread = B_TRUE;
2238         } else {
2239                 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2240                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2241         }
2242         if (signal_inactive_thread) {
2243                 cv_signal(&mi->mi_inact_req_cv);
2244         } else  {
2245                 mi->mi_async_req_count++;
2246                 ASSERT(mi->mi_async_req_count != 0);
2247                 cv_signal(&mi->mi_async_reqs_cv);
2248         }
2249 
2250         mutex_exit(&mi->mi_async_lock);
2251 }
2252 
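     /*
      * writerp4 - copy tcount bytes from uio into the file's pages at the
      * current uio offset, moving at most PAGESIZE bytes per iteration.
      * Pages are created without first being read in when the write will
      * cover the whole page or runs to or past the current end of file;
      * r_size and R4DIRTY are updated under r_statelock.
      */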
2253 int
2254 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2255 {
2256         int pagecreate;
2257         int n;
2258         int saved_n;
2259         caddr_t saved_base;
2260         u_offset_t offset;
2261         int error;
2262         int sm_error;
2263         vnode_t *vp = RTOV(rp);
2264 
2265         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2266         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2267         if (!vpm_enable) {
2268                 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2269         }
2270 
2271         /*
2272          * Move bytes in at most PAGESIZE chunks. We must avoid
2273          * spanning pages in uiomove() because page faults may cause
2274          * the cache to be invalidated out from under us. The r_size is not
2275          * updated until after the uiomove. If we push the last page of a
2276          * file before r_size is correct, we will lose the data written past
2277          * the current (and invalid) r_size.
2278          */
2279         do {
2280                 offset = uio->uio_loffset;
2281                 pagecreate = 0;
2282 
2283                 /*
2284                  * n is the number of bytes required to satisfy the request
2285                  *   or the number of bytes to fill out the page.
2286                  */
2287                 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2288 
2289                 /*
2290                  * Check to see if we can skip reading in the page
2291                  * and just allocate the memory.  We can do this
2292                  * if we are going to rewrite the entire mapping
2293                  * or if we are going to write to or beyond the current
2294                  * end of file from the beginning of the mapping.
2295                  *
2296                  * The read of r_size is now protected by r_statelock.
2297                  */
2298                 mutex_enter(&rp->r_statelock);
2299                 /*
2300                  * When pgcreated is nonzero the caller has already done
2301                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2302                  * segkpm this means we already have at least one page
2303                  * created and mapped at base.
2304                  */
2305                 pagecreate = pgcreated ||
2306                     ((offset & PAGEOFFSET) == 0 &&
2307                     (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2308 
2309                 mutex_exit(&rp->r_statelock);
2310 
2311                 if (!vpm_enable && pagecreate) {
2312                         /*
2313                          * The last argument tells segmap_pagecreate() to
2314                          * always lock the page, as opposed to sometimes
2315                          * returning with the page locked. This way we avoid a
2316                          * fault on the ensuing uiomove(), but also
2317                          * more importantly (to fix bug 1094402) we can
2318                          * call segmap_fault() to unlock the page in all
2319                          * cases. An alternative would be to modify
2320                          * segmap_pagecreate() to tell us when it is
2321                          * locking a page, but that's a fairly major
2322                          * interface change.
2323                          */
2324                         if (pgcreated == 0)
2325                                 (void) segmap_pagecreate(segkmap, base,
2326                                     (uint_t)n, 1);
2327                         saved_base = base;
2328                         saved_n = n;
2329                 }
2330 
2331                 /*
2332                  * The number of bytes of data in the last page cannot
2333                  * be accurately determined while the page is being
2334                  * uiomove'd to and the size of the file is being updated.
2335                  * Thus, inform threads which need to know accurately
2336                  * how much data is in the last page of the file.  They
2337                  * will not do the i/o immediately, but will arrange for
2338                  * the i/o to happen later when this modify operation
2339                  * will have finished.
2340                  */
2341                 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2342                 mutex_enter(&rp->r_statelock);
2343                 rp->r_flags |= R4MODINPROGRESS;
2344                 rp->r_modaddr = (offset & MAXBMASK);
2345                 mutex_exit(&rp->r_statelock);
2346 
2347                 if (vpm_enable) {
2348                         /*
2349                          * Copy data. If new pages are created, part of
2350                          * the page that is not written will be initialized
2351                          * with zeros.
2352                          */
2353                         error = vpm_data_copy(vp, offset, n, uio,
2354                             !pagecreate, NULL, 0, S_WRITE);
2355                 } else {
2356                         error = uiomove(base, n, UIO_WRITE, uio);
2357                 }
2358 
2359                 /*
2360                  * r_size is the maximum number of
2361                  * bytes known to be in the file.
2362                  * Make sure it is at least as high as the
2363                  * first unwritten byte pointed to by uio_loffset.
2364                  */
2365                 mutex_enter(&rp->r_statelock);
2366                 if (rp->r_size < uio->uio_loffset)
2367                         rp->r_size = uio->uio_loffset;
2368                 rp->r_flags &= ~R4MODINPROGRESS;
2369                 rp->r_flags |= R4DIRTY;
2370                 mutex_exit(&rp->r_statelock);
2371 
2372                 /* n = # of bytes written */
2373                 n = (int)(uio->uio_loffset - offset);
2374 
2375                 if (!vpm_enable) {
2376                         base += n;
2377                 }
2378 
2379                 tcount -= n;
2380                 /*
2381                  * If we created pages w/o initializing them completely,
2382                  * we need to zero the part that wasn't set up.
2383                  * This happens in most EOF write cases and if
2384                  * we had some sort of error during the uiomove.
2385                  */
2386                 if (!vpm_enable && pagecreate) {
2387                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2388                                 (void) kzero(base, PAGESIZE - n);
2389 
2390                         if (pgcreated) {
2391                                 /*
2392                                  * Caller is responsible for this page,
2393                                  * it was not created in this loop.
2394                                  */
2395                                 pgcreated = 0;
2396                         } else {
2397                                 /*
2398                                  * For bug 1094402: segmap_pagecreate locks
2399                                  * page. Unlock it. This also unlocks the
2400                                  * pages allocated by page_create_va() in
2401                                  * segmap_pagecreate().
2402                                  */
2403                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2404                                     saved_base, saved_n,
2405                                     F_SOFTUNLOCK, S_WRITE);
2406                                 if (error == 0)
2407                                         error = sm_error;
2408                         }
2409                 }
2410         } while (tcount > 0 && error == 0);
2411 
2412         return (error);
2413 }
2414 
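     /*
      * Flush dirty pages for the given range of the vnode; a length of
      * zero means from 'off' to the end of the file.  Full-file flushes
      * clear R4DIRTY up front and restore it if the flush fails, so the
      * flag reflects whether dirty pages may still be outstanding.
      */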
2415 int
2416 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2417 {
2418         rnode4_t *rp;
2419         page_t *pp;
2420         u_offset_t eoff;
2421         u_offset_t io_off;
2422         size_t io_len;
2423         int error;
2424         int rdirty;
2425         int err;
2426 
2427         rp = VTOR4(vp);
2428         ASSERT(rp->r_count > 0);
2429 
2430         if (!nfs4_has_pages(vp))
2431                 return (0);
2432 
2433         ASSERT(vp->v_type != VCHR);
2434 
2435         /*
2436          * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2437          * writes.  B_FORCE is set to force the VM system to actually
2438          * invalidate the pages, even if the i/o failed.  The pages
2439          * need to get invalidated because they can't be written out
2440          * because there isn't any space left on either the server's
2441          * file system or in the user's disk quota.  The B_FREE bit
2442          * is cleared to avoid confusion as to whether this is a
2443          * request to place the page on the freelist or to destroy
2444          * it.
2445          */
2446         if ((rp->r_flags & R4OUTOFSPACE) ||
2447             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2448                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2449 
2450         if (len == 0) {
2451                 /*
2452                  * If doing a full file synchronous operation, then clear
2453                  * the R4DIRTY bit.  If a page gets dirtied while the flush
2454                  * is happening, then R4DIRTY will get set again.  The
2455                  * R4DIRTY bit must get cleared before the flush so that
2456                  * we don't lose this information.
2457                  *
2458                  * If there are no full file async write operations
2459                  * pending and the R4DIRTY bit is set, clear it.
2460                  */
2461                 if (off == (u_offset_t)0 &&
2462                     !(flags & B_ASYNC) &&
2463                     (rp->r_flags & R4DIRTY)) {
2464                         mutex_enter(&rp->r_statelock);
2465                         rdirty = (rp->r_flags & R4DIRTY);
2466                         rp->r_flags &= ~R4DIRTY;
2467                         mutex_exit(&rp->r_statelock);
2468                 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2469                         mutex_enter(&rp->r_statelock);
2470                         if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2471                                 rdirty = (rp->r_flags & R4DIRTY);
2472                                 rp->r_flags &= ~R4DIRTY;
2473                         }
2474                         mutex_exit(&rp->r_statelock);
2475                 } else
2476                         rdirty = 0;
2477 
2478                 /*
2479                  * Search the entire vp list for pages >= off, and flush
2480                  * the dirty pages.
2481                  */
2482                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2483                     flags, cr);
2484 
2485                 /*
2486                  * If an error occurred and the file was marked as dirty
2487                  * before and we aren't forcibly invalidating pages, then
2488                  * reset the R4DIRTY flag.
2489                  */
2490                 if (error && rdirty &&
2491                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2492                         mutex_enter(&rp->r_statelock);
2493                         rp->r_flags |= R4DIRTY;
2494                         mutex_exit(&rp->r_statelock);
2495                 }
2496         } else {
2497                 /*
2498                  * Do a range from [off...off + len) looking for pages
2499                  * to deal with.
2500                  */
2501                 error = 0;
2502                 io_len = 0;
2503                 eoff = off + len;
2504                 mutex_enter(&rp->r_statelock);
2505                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2506                     io_off += io_len) {
2507                         mutex_exit(&rp->r_statelock);
2508                         /*
2509                          * If we are not invalidating, synchronously
2510                          * freeing, or writing pages, use the routine
2511                          * page_lookup_nowait() to prevent reclaiming
2512                          * them from the free list.
2513                          */
2514                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2515                                 pp = page_lookup(vp, io_off,
2516                                     (flags & (B_INVAL | B_FREE)) ?
2517                                     SE_EXCL : SE_SHARED);
2518                         } else {
2519                                 pp = page_lookup_nowait(vp, io_off,
2520                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2521                         }
2522 
2523                         if (pp == NULL || !pvn_getdirty(pp, flags))
2524                                 io_len = PAGESIZE;
2525                         else {
2526                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2527                                     &io_len, flags, cr);
2528                                 if (!error)
2529                                         error = err;
2530                                 /*
2531                                  * "io_off" and "io_len" are returned as
2532                                  * the range of pages we actually wrote.
2533                                  * This allows us to skip ahead more quickly
2534                                  * since several pages may've been dealt
2535                                  * with by this iteration of the loop.
2536                                  */
2537                         }
2538                         mutex_enter(&rp->r_statelock);
2539                 }
2540                 mutex_exit(&rp->r_statelock);
2541         }
2542 
2543         return (error);
2544 }
2545 
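     /*
      * Invalidate (and truncate) cached pages from 'off' to the end of
      * the file.  The R4TRUNCATE flag serializes concurrent truncations
      * of the same rnode.
      */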
2546 void
2547 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2548 {
2549         rnode4_t *rp;
2550 
2551         rp = VTOR4(vp);
2552         if (IS_SHADOW(vp, rp))
2553                 vp = RTOV4(rp);
2554         mutex_enter(&rp->r_statelock);
2555         while (rp->r_flags & R4TRUNCATE)
2556                 cv_wait(&rp->r_cv, &rp->r_statelock);
2557         rp->r_flags |= R4TRUNCATE;
2558         if (off == (u_offset_t)0) {
2559                 rp->r_flags &= ~R4DIRTY;
2560                 if (!(rp->r_flags & R4STALE))
2561                         rp->r_error = 0;
2562         }
2563         rp->r_truncaddr = off;
2564         mutex_exit(&rp->r_statelock);
2565         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2566             B_INVAL | B_TRUNC, cr);
2567         mutex_enter(&rp->r_statelock);
2568         rp->r_flags &= ~R4TRUNCATE;
2569         cv_broadcast(&rp->r_cv);
2570         mutex_exit(&rp->r_statelock);
2571 }
2572 
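     /*
      * kstat update routine for the read-only per-mount "mntinfo" kstat;
      * it snapshots the current mount parameters into the mntinfo_kstat
      * structure.
      */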
2573 static int
2574 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2575 {
2576         mntinfo4_t *mi;
2577         struct mntinfo_kstat *mik;
2578         vfs_t *vfsp;
2579 
2580         /* this is a read-only kstat. Bail out on a write */
2581         if (rw == KSTAT_WRITE)
2582                 return (EACCES);
2583 
2584 
2585         /*
2586          * We don't want to wait here as kstat_chain_lock could be held by
2587          * dounmount(). dounmount() takes vfs_reflock before the chain lock
2588          * and thus could lead to a deadlock.
2589          */
2590         vfsp = (struct vfs *)ksp->ks_private;
2591 
2592         mi = VFTOMI4(vfsp);
2593         mik = (struct mntinfo_kstat *)ksp->ks_data;
2594 
2595         (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2596 
2597         mik->mik_vers = (uint32_t)mi->mi_vers;
2598         mik->mik_flags = mi->mi_flags;
2599         /*
2600          * The sv_secdata holds the flavor the client specifies.
2601          * If the client uses default and a security negotiation
2602          * occurs, sv_currsec will point to the current flavor
2603          * selected from the server flavor list.
2604          * sv_currsec is NULL if no security negotiation takes place.
2605          */
2606         mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2607             mi->mi_curr_serv->sv_currsec->secmod :
2608             mi->mi_curr_serv->sv_secdata->secmod;
2609         mik->mik_curread = (uint32_t)mi->mi_curread;
2610         mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2611         mik->mik_retrans = mi->mi_retrans;
2612         mik->mik_timeo = mi->mi_timeo;
2613         mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2614         mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2615         mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2616         mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2617         mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2618         mik->mik_failover = (uint32_t)mi->mi_failover;
2619         mik->mik_remap = (uint32_t)mi->mi_remap;
2620 
2621         (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2622 
2623         return (0);
2624 }
2625 
2626 void
2627 nfs4_mnt_kstat_init(struct vfs *vfsp)
2628 {
2629         mntinfo4_t *mi = VFTOMI4(vfsp);
2630 
2631         /*
2632          * PSARC 2001/697 Contract Private Interface
2633          * All nfs kstats are under SunMC contract
2634          * Please refer to the PSARC listed above and contact
2635          * SunMC before making any changes!
2636          *
2637          * Changes must be reviewed by Solaris File Sharing
2638          * Changes must be communicated to contract-2001-697@sun.com
2639          *
2640          */
2641 
2642         mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2643             NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2644         if (mi->mi_io_kstats) {
2645                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2646                         kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2647                 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2648                 kstat_install(mi->mi_io_kstats);
2649         }
2650 
2651         if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2652             getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2653             sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2654                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2655                         kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2656                 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2657                 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2658                 kstat_install(mi->mi_ro_kstats);
2659         }
2660 
2661         nfs4_mnt_recov_kstat_init(vfsp);
2662 }
2663 
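     /*
      * Report a write error on the console.  Messages are suppressed
      * during forced unmount and after failed recovery, and ENOSPC/EDQUOT
      * messages are rate limited per file system.
      */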
2664 void
2665 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2666 {
2667         mntinfo4_t *mi;
2668         clock_t now = ddi_get_lbolt();
2669 
2670         mi = VTOMI4(vp);
2671         /*
2672          * In case of forced unmount, do not print any messages
2673          * since it can flood the console with error messages.
2674          */
2675         if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2676                 return;
2677 
2678         /*
2679          * If the mount point is dead, not recoverable, do not
2680          * print error messages that can flood the console.
2681          */
2682         if (mi->mi_flags & MI4_RECOV_FAIL)
2683                 return;
2684 
2685         /*
2686          * No use in flooding the console with ENOSPC
2687          * messages from the same file system.
2688          */
2689         if ((error != ENOSPC && error != EDQUOT) ||
2690             now - mi->mi_printftime > 0) {
2691                 zoneid_t zoneid = mi->mi_zone->zone_id;
2692 
2693 #ifdef DEBUG
2694                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2695                     mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2696 #else
2697                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2698                     VTOR4(vp)->r_server->sv_hostname, NULL);
2699 #endif
2700                 if (error == ENOSPC || error == EDQUOT) {
2701                         zcmn_err(zoneid, CE_CONT,
2702                             "^File: userid=%d, groupid=%d\n",
2703                             crgetuid(cr), crgetgid(cr));
2704                         if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2705                             crgetgid(curthread->t_cred) != crgetgid(cr)) {
2706                                 zcmn_err(zoneid, CE_CONT,
2707                                     "^User: userid=%d, groupid=%d\n",
2708                                     crgetuid(curthread->t_cred),
2709                                     crgetgid(curthread->t_cred));
2710                         }
2711                         mi->mi_printftime = now +
2712                             nfs_write_error_interval * hz;
2713                 }
2714                 sfh4_printfhandle(VTOR4(vp)->r_fh);
2715 #ifdef DEBUG
2716                 if (error == EACCES) {
2717                         zcmn_err(zoneid, CE_CONT,
2718                             "nfs_bio: cred is%s kcred\n",
2719                             cr == kcred ? "" : " not");
2720                 }
2721 #endif
2722         }
2723 }
2724 
2725 /*
2726  * Return non-zero if the given file can be safely memory mapped.  Locks
2727  * are safe if whole-file (length and offset are both zero).
2728  */
2729 
2730 #define SAFE_LOCK(flk)  ((flk).l_start == 0 && (flk).l_len == 0)
2731 
2732 static int
2733 nfs4_safemap(const vnode_t *vp)
2734 {
2735         locklist_t      *llp, *next_llp;
2736         int             safe = 1;
2737         rnode4_t        *rp = VTOR4(vp);
2738 
2739         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2740 
2741         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2742             "vp = %p", (void *)vp));
2743 
2744         /*
2745          * Review all the locks for the vnode, both ones that have been
2746          * acquired and ones that are pending.  We assume that
2747          * flk_active_locks_for_vp() has merged any locks that can be
2748          * merged (so that if a process has the entire file locked, it is
2749          * represented as a single lock).
2750          *
2751          * Note that we can't bail out of the loop if we find a non-safe
2752          * lock, because we have to free all the elements in the llp list.
2753          * We might be able to speed up this code slightly by not looking
2754          * at each lock's l_start and l_len fields once we've found a
2755          * non-safe lock.
2756          */
2757 
2758         llp = flk_active_locks_for_vp(vp);
2759         while (llp) {
2760                 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2761                     "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2762                     llp->ll_flock.l_start, llp->ll_flock.l_len));
2763                 if (!SAFE_LOCK(llp->ll_flock)) {
2764                         safe = 0;
2765                         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2766                             "nfs4_safemap: unsafe active lock (%" PRId64
2767                             ", %" PRId64 ")", llp->ll_flock.l_start,
2768                             llp->ll_flock.l_len));
2769                 }
2770                 next_llp = llp->ll_next;
2771                 VN_RELE(llp->ll_vp);
2772                 kmem_free(llp, sizeof (*llp));
2773                 llp = next_llp;
2774         }
2775 
2776         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2777             safe ? "safe" : "unsafe"));
2778         return (safe);
2779 }
2780 
2781 /*
2782  * Return whether there is a lost LOCK or LOCKU queued up for the given
2783  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2784  */
2785 
2786 bool_t
2787 nfs4_map_lost_lock_conflict(vnode_t *vp)
2788 {
2789         bool_t conflict = FALSE;
2790         nfs4_lost_rqst_t *lrp;
2791         mntinfo4_t *mi = VTOMI4(vp);
2792 
2793         mutex_enter(&mi->mi_lock);
2794         for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2795             lrp = list_next(&mi->mi_lost_state, lrp)) {
2796                 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2797                         continue;
2798                 ASSERT(lrp->lr_vp != NULL);
2799                 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2800                         continue;       /* different file */
2801                 if (!SAFE_LOCK(*lrp->lr_flk)) {
2802                         conflict = TRUE;
2803                         break;
2804                 }
2805         }
2806 
2807         mutex_exit(&mi->mi_lock);
2808         return (conflict);
2809 }
2810 
2811 /*
2812  * nfs4_lockcompletion:
2813  *
2814  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2815  * as non-cacheable (set the VNOCACHE bit).
2816  */
2817 
2818 void
2819 nfs4_lockcompletion(vnode_t *vp, int cmd)
2820 {
2821         rnode4_t *rp = VTOR4(vp);
2822 
2823         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2824         ASSERT(!IS_SHADOW(vp, rp));
2825 
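             /*
              * Only lock-set requests (F_SETLK/F_SETLKW) can change whether
              * the file remains safe to cache when mapped; re-evaluate the
              * mapping safety for those and set or clear VNOCACHE to match.
              */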
2826         if (cmd == F_SETLK || cmd == F_SETLKW) {
2827 
2828                 if (!nfs4_safemap(vp)) {
2829                         mutex_enter(&vp->v_lock);
2830                         vp->v_flag |= VNOCACHE;
2831                         mutex_exit(&vp->v_lock);
2832                 } else {
2833                         mutex_enter(&vp->v_lock);
2834                         vp->v_flag &= ~VNOCACHE;
2835                         mutex_exit(&vp->v_lock);
2836                 }
2837         }
2838         /*
2839          * The cached attributes of the file are stale after acquiring
2840          * the lock on the file. They were updated when the file was
2841          * opened, but not updated when the lock was acquired. Therefore the
2842          * cached attributes are invalidated after the lock is obtained.
2843          */
2844         PURGE_ATTRCACHE4(vp);
2845 }
2846 
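     /*
      * Zone key constructor (registered via zone_key_create() in
      * nfs4_clnt_init()): allocate and initialize the per-zone list of
      * NFSv4 mounts.
      */
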
2847 /* ARGSUSED */
2848 static void *
2849 nfs4_mi_init(zoneid_t zoneid)
2850 {
2851         struct mi4_globals *mig;
2852 
2853         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2854         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2855         list_create(&mig->mig_list, sizeof (mntinfo4_t),
2856             offsetof(mntinfo4_t, mi_zone_node));
2857         mig->mig_destructor_called = B_FALSE;
2858         return (mig);
2859 }
2860 
2861 /*
2862  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2863  * state and killing off threads.
2864  */
2865 /* ARGSUSED */
2866 static void
2867 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2868 {
2869         struct mi4_globals *mig = data;
2870         mntinfo4_t *mi;
2871         nfs4_server_t *np;
2872 
2873         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2874             "nfs4_mi_shutdown zone %d\n", zoneid));
2875         ASSERT(mig != NULL);
2876         for (;;) {
2877                 mutex_enter(&mig->mig_lock);
2878                 mi = list_head(&mig->mig_list);
2879                 if (mi == NULL) {
2880                         mutex_exit(&mig->mig_lock);
2881                         break;
2882                 }
2883 
2884                 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885                     "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2886                 /*
2887                  * purge the DNLC for this filesystem
2888                  */
2889                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2890                 /*
2891                  * Tell existing async worker threads to exit.
2892                  */
2893                 mutex_enter(&mi->mi_async_lock);
2894                 mi->mi_max_threads = 0;
2895                 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2896                 /*
2897                  * Set the appropriate flags, signal and wait for both the
2898                  * async manager and the inactive thread to exit when they're
2899                  * done with their current work.
2900                  */
2901                 mutex_enter(&mi->mi_lock);
2902                 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2903                 mutex_exit(&mi->mi_lock);
2904                 mutex_exit(&mi->mi_async_lock);
2905                 if (mi->mi_manager_thread) {
2906                         nfs4_async_manager_stop(mi->mi_vfsp);
2907                 }
2908                 if (mi->mi_inactive_thread) {
2909                         mutex_enter(&mi->mi_async_lock);
2910                         cv_signal(&mi->mi_inact_req_cv);
2911                         /*
2912                          * Wait for the inactive thread to exit.
2913                          */
2914                         while (mi->mi_inactive_thread != NULL) {
2915                                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2916                         }
2917                         mutex_exit(&mi->mi_async_lock);
2918                 }
2919                 /*
2920                  * Wait for the recovery thread to complete; it will
2921                  * signal when it is done using the "mi" structure and
2922                  * is about to exit.
2923                  */
2924                 mutex_enter(&mi->mi_lock);
2925                 while (mi->mi_in_recovery > 0)
2926                         cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2927                 mutex_exit(&mi->mi_lock);
2928                 /*
2929                  * This mi is done; remove it from the list.  The loop
2930                  * ends when the list is empty.
2931                  */
2932                 list_remove(&mig->mig_list, mi);
2933                 mutex_exit(&mig->mig_lock);
2934                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2935 
2936                 /*
2937                  * Release the vfs and mi holds taken to prevent a race with
2938                  * zone shutdown; the holds were taken in nfs4_mi_zonelist_add.
2939                  */
2940                 VFS_RELE(mi->mi_vfsp);
2941                 MI4_RELE(mi);
2942         }
2943         /*
2944          * Tell each renew thread in the zone to exit
2945          */
2946         mutex_enter(&nfs4_server_lst_lock);
2947         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2948                 mutex_enter(&np->s_lock);
2949                 if (np->zoneid == zoneid) {
2950                         /*
2951                          * We add another hold onto the nfs4_server_t
2952                          * because this will make sure that the nfs4_server_t
2953                          * stays around until nfs4_callback_fini_zone destroys
2954                          * the zone. This way, the renew thread can
2955                          * unconditionally release its holds on the
2956                          * nfs4_server_t.
2957                          */
2958                         np->s_refcnt++;
2959                         nfs4_mark_srv_dead(np);
2960                 }
2961                 mutex_exit(&np->s_lock);
2962         }
2963         mutex_exit(&nfs4_server_lst_lock);
2964 }
2965 
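     /*
      * Free the per-zone NFSv4 mount globals.  The mount list must already
      * be empty.
      */
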
2966 static void
2967 nfs4_mi_free_globals(struct mi4_globals *mig)
2968 {
2969         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2970         mutex_destroy(&mig->mig_lock);
2971         kmem_free(mig, sizeof (*mig));
2972 }
2973 
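     /*
      * Zone key destructor: free the per-zone NFSv4 mount globals.  If a
      * mount is still awaiting VFS_FREEVFS(), just note that the destructor
      * ran; the last nfs4_mi_zonelist_remove() will free the globals.
      */
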
2974 /* ARGSUSED */
2975 static void
2976 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2977 {
2978         struct mi4_globals *mig = data;
2979 
2980         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2981             "nfs4_mi_destroy zone %d\n", zoneid));
2982         ASSERT(mig != NULL);
2983         mutex_enter(&mig->mig_lock);
2984         if (list_head(&mig->mig_list) != NULL) {
2985                 /* Still waiting for VFS_FREEVFS() */
2986                 mig->mig_destructor_called = B_TRUE;
2987                 mutex_exit(&mig->mig_lock);
2988                 return;
2989         }
2990         nfs4_mi_free_globals(mig);
2991 }
2992 
2993 /*
2994  * Add an NFS mount to the per-zone list of NFS mounts.
2995  */
2996 void
2997 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2998 {
2999         struct mi4_globals *mig;
3000 
3001         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3002         mutex_enter(&mig->mig_lock);
3003         list_insert_head(&mig->mig_list, mi);
3004         /*
3005          * Holds added to eliminate a race with zone shutdown; they are
3006          * released in nfs4_mi_shutdown or nfs4_mi_zonelist_remove.
3007          */
3008         MI4_HOLD(mi);
3009         VFS_HOLD(mi->mi_vfsp);
3010         mutex_exit(&mig->mig_lock);
3011 }
3012 
3013 /*
3014  * Remove an NFS mount from the per-zone list of NFS mounts.
3015  */
3016 int
3017 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3018 {
3019         struct mi4_globals *mig;
3020         int ret = 0;
3021 
3022         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3023         mutex_enter(&mig->mig_lock);
3024         mutex_enter(&mi->mi_lock);
3025         /* if this mi is marked dead, then the zone already released it */
3026         if (!(mi->mi_flags & MI4_DEAD)) {
3027                 list_remove(&mig->mig_list, mi);
3028                 mutex_exit(&mi->mi_lock);
3029 
3030                 /* release the holds put on in zonelist_add(). */
3031                 VFS_RELE(mi->mi_vfsp);
3032                 MI4_RELE(mi);
3033                 ret = 1;
3034         } else {
3035                 mutex_exit(&mi->mi_lock);
3036         }
3037 
3038         /*
3039          * We can be called asynchronously by VFS_FREEVFS() after the zone
3040          * shutdown/destroy callbacks have executed; if so, clean up the zone's
3041          * mi globals.
3042          */
3043         if (list_head(&mig->mig_list) == NULL &&
3044             mig->mig_destructor_called == B_TRUE) {
3045                 nfs4_mi_free_globals(mig);
3046                 return (ret);
3047         }
3048         mutex_exit(&mig->mig_lock);
3049         return (ret);
3050 }
3051 
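     /*
      * Free a mntinfo4 structure and the resources hanging off of it.
      * Assumes the async, manager, inactive and recovery threads have
      * already exited and that all other references have been released.
      */
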
3052 void
3053 nfs_free_mi4(mntinfo4_t *mi)
3054 {
3055         nfs4_open_owner_t       *foop;
3056         nfs4_oo_hash_bucket_t   *bucketp;
3057         nfs4_debug_msg_t        *msgp;
3058         int i;
3059         servinfo4_t             *svp;
3060 
3061         /*
3062          * Code introduced here should be carefully evaluated to make
3063          * sure none of the freed resources are accessed either directly
3064          * or indirectly after freeing them.  For example: introducing calls
3065          * to NFS4_DEBUG that use mntinfo4_t structure members after they have
3066          * been freed, or other routines that call back into NFS and access
3067          * freed mntinfo4_t structure members.
3068          */
3069         mutex_enter(&mi->mi_lock);
3070         ASSERT(mi->mi_recovthread == NULL);
3071         ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3072         mutex_exit(&mi->mi_lock);
3073         mutex_enter(&mi->mi_async_lock);
3074         ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3075             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3076         ASSERT(mi->mi_manager_thread == NULL);
3077         mutex_exit(&mi->mi_async_lock);
3078         if (mi->mi_io_kstats) {
3079                 kstat_delete(mi->mi_io_kstats);
3080                 mi->mi_io_kstats = NULL;
3081         }
3082         if (mi->mi_ro_kstats) {
3083                 kstat_delete(mi->mi_ro_kstats);
3084                 mi->mi_ro_kstats = NULL;
3085         }
3086         if (mi->mi_recov_ksp) {
3087                 kstat_delete(mi->mi_recov_ksp);
3088                 mi->mi_recov_ksp = NULL;
3089         }
3090         mutex_enter(&mi->mi_msg_list_lock);
3091         while ((msgp = list_head(&mi->mi_msg_list)) != NULL) {
3092                 list_remove(&mi->mi_msg_list, msgp);
3093                 nfs4_free_msg(msgp);
3094         }
3095         mutex_exit(&mi->mi_msg_list_lock);
3096         list_destroy(&mi->mi_msg_list);
3097         if (mi->mi_fname != NULL)
3098                 fn_rele(&mi->mi_fname);
3099         if (mi->mi_rootfh != NULL)
3100                 sfh4_rele(&mi->mi_rootfh);
3101         if (mi->mi_srvparentfh != NULL)
3102                 sfh4_rele(&mi->mi_srvparentfh);
3103         svp = mi->mi_servers;
3104         sv4_free(svp);
3105         mutex_destroy(&mi->mi_lock);
3106         mutex_destroy(&mi->mi_async_lock);
3107         mutex_destroy(&mi->mi_msg_list_lock);
3108         nfs_rw_destroy(&mi->mi_recovlock);
3109         nfs_rw_destroy(&mi->mi_rename_lock);
3110         nfs_rw_destroy(&mi->mi_fh_lock);
3111         cv_destroy(&mi->mi_failover_cv);
3112         cv_destroy(&mi->mi_async_reqs_cv);
3113         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3114         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3115         cv_destroy(&mi->mi_async_cv);
3116         cv_destroy(&mi->mi_inact_req_cv);
3117         /*
3118          * Destroy the oo hash lists and mutexes for the cred hash table.
3119          */
3120         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3121                 bucketp = &(mi->mi_oo_list[i]);
3122                 /* Destroy any remaining open owners on the list */
3123                 foop = list_head(&bucketp->b_oo_hash_list);
3124                 while (foop != NULL) {
3125                         list_remove(&bucketp->b_oo_hash_list, foop);
3126                         nfs4_destroy_open_owner(foop);
3127                         foop = list_head(&bucketp->b_oo_hash_list);
3128                 }
3129                 list_destroy(&bucketp->b_oo_hash_list);
3130                 mutex_destroy(&bucketp->b_lock);
3131         }
3132         /*
3133          * Empty and destroy the freed open owner list.
3134          */
3135         foop = list_head(&mi->mi_foo_list);
3136         while (foop != NULL) {
3137                 list_remove(&mi->mi_foo_list, foop);
3138                 nfs4_destroy_open_owner(foop);
3139                 foop = list_head(&mi->mi_foo_list);
3140         }
3141         list_destroy(&mi->mi_foo_list);
3142         list_destroy(&mi->mi_bseqid_list);
3143         list_destroy(&mi->mi_lost_state);
3144         avl_destroy(&mi->mi_filehandles);
3145         kmem_free(mi, sizeof (*mi));
3146 }
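
     /*
      * Take a hold on the given mntinfo4.
      */
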
3147 void
3148 mi_hold(mntinfo4_t *mi)
3149 {
3150         atomic_add_32(&mi->mi_count, 1);
3151         ASSERT(mi->mi_count != 0);
3152 }
3153 
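     /*
      * Release a hold on the given mntinfo4; the structure is freed when
      * the last hold is released.
      */
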
3154 void
3155 mi_rele(mntinfo4_t *mi)
3156 {
3157         ASSERT(mi->mi_count != 0);
3158         if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3159                 nfs_free_mi4(mi);
3160         }
3161 }
3162 
3163 vnode_t    nfs4_xattr_notsupp_vnode;
3164 
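     /*
      * Initialize the NFSv4 client subsystems, register the CPR callback
      * used to detect a resume, and create the per-zone mount-list key.
      */
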
3165 void
3166 nfs4_clnt_init(void)
3167 {
3168         nfs4_vnops_init();
3169         (void) nfs4_rnode_init();
3170         (void) nfs4_shadow_init();
3171         (void) nfs4_acache_init();
3172         (void) nfs4_subr_init();
3173         nfs4_acl_init();
3174         nfs_idmap_init();
3175         nfs4_callback_init();
3176         nfs4_secinfo_init();
3177 #ifdef  DEBUG
3178         tsd_create(&nfs4_tsd_key, NULL);
3179 #endif
3180 
3181         /*
3182          * Add a CPR callback so that we can update client
3183          * lease after a suspend and resume.
3184          */
3185         cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3186 
3187         zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3188             nfs4_mi_destroy);
3189 
3190         /*
3191          * Initialise the reference count of the notsupp xattr cache vnode to 1
3192          * so that it never goes away (VOP_INACTIVE isn't called on it).
3193          */
3194         nfs4_xattr_notsupp_vnode.v_count = 1;
3195 }
3196 
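     /*
      * Tear down the NFSv4 client subsystems set up in nfs4_clnt_init().
      */
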
3197 void
3198 nfs4_clnt_fini(void)
3199 {
3200         (void) zone_key_delete(mi4_list_key);
3201         nfs4_vnops_fini();
3202         (void) nfs4_rnode_fini();
3203         (void) nfs4_shadow_fini();
3204         (void) nfs4_acache_fini();
3205         (void) nfs4_subr_fini();
3206         nfs_idmap_fini();
3207         nfs4_callback_fini();
3208         nfs4_secinfo_fini();
3209 #ifdef  DEBUG
3210         tsd_destroy(&nfs4_tsd_key);
3211 #endif
3212         if (cid)
3213                 (void) callb_delete(cid);
3214 }
3215 
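     /*
      * CPR (suspend/resume) callback registered in nfs4_clnt_init().  On
      * resume it records the current time so the lease renewal thread knows
      * to force a RENEW.
      */
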
3216 /*ARGSUSED*/
3217 static boolean_t
3218 nfs4_client_cpr_callb(void *arg, int code)
3219 {
3220         /*
3221          * We get called for Suspend and Resume events.
3222          * For the suspend case we simply don't care!
3223          */
3224         if (code == CB_CODE_CPR_CHKPT) {
3225                 return (B_TRUE);
3226         }
3227 
3228         /*
3229          * When we get to here we are in the process of
3230          * resuming the system from a previous suspend.
3231          */
3232         nfs4_client_resumed = gethrestime_sec();
3233         return (B_TRUE);
3234 }
3235 
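     /*
      * Lease renewal thread for an nfs4_server_t.  While the server holds
      * state for us and the lease is valid, sleep for roughly half the lease
      * period and then issue a RENEW, unless some other over-the-wire call
      * renewed the lease implicitly while we slept.  The thread exits when
      * s_thread_exit is set to NFS4_THREAD_EXIT.
      */
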
3236 void
3237 nfs4_renew_lease_thread(nfs4_server_t *sp)
3238 {
3239         int     error = 0;
3240         time_t  tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3241         clock_t tick_delay = 0;
3242         clock_t time_left = 0;
3243         callb_cpr_t cpr_info;
3244         kmutex_t cpr_lock;
3245 
3246         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3247             "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3248         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3249         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3250 
3251         mutex_enter(&sp->s_lock);
3252         /* sp->s_lease_time is set via a GETATTR */
3253         sp->last_renewal_time = gethrestime_sec();
3254         sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3255         ASSERT(sp->s_refcnt >= 1);
3256 
3257         for (;;) {
3258                 if (!sp->state_ref_count ||
3259                     sp->lease_valid != NFS4_LEASE_VALID) {
3260 
3261                         kip_secs = MAX((sp->s_lease_time >> 1) -
3262                             (3 * sp->propagation_delay.tv_sec), 1);
3263 
3264                         tick_delay = SEC_TO_TICK(kip_secs);
3265 
3266                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3267                             "nfs4_renew_lease_thread: no renew : thread "
3268                             "wait %ld secs", kip_secs));
3269 
3270                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3271                             "nfs4_renew_lease_thread: no renew : "
3272                             "state_ref_count %d, lease_valid %d",
3273                             sp->state_ref_count, sp->lease_valid));
3274 
3275                         mutex_enter(&cpr_lock);
3276                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
3277                         mutex_exit(&cpr_lock);
3278                         time_left = cv_reltimedwait(&sp->cv_thread_exit,
3279                             &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3280                         mutex_enter(&cpr_lock);
3281                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3282                         mutex_exit(&cpr_lock);
3283 
3284                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3285                             "nfs4_renew_lease_thread: no renew: "
3286                             "time left %ld", time_left));
3287 
3288                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3289                                 goto die;
3290                         continue;
3291                 }
3292 
3293                 tmp_last_renewal_time = sp->last_renewal_time;
3294 
3295                 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3296                     (3 * sp->propagation_delay.tv_sec);
3297 
3298                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3299                     "nfs4_renew_lease_thread: tmp_time %ld, "
3300                     "sp->last_renewal_time %ld", tmp_time,
3301                     sp->last_renewal_time));
3302 
3303                 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3304 
3305                 tick_delay = SEC_TO_TICK(kip_secs);
3306 
3307                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3308                     "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3309                     "secs", kip_secs));
3310 
3311                 mutex_enter(&cpr_lock);
3312                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3313                 mutex_exit(&cpr_lock);
3314                 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3315                     tick_delay, TR_CLOCK_TICK);
3316                 mutex_enter(&cpr_lock);
3317                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3318                 mutex_exit(&cpr_lock);
3319 
3320                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3321                     "nfs4_renew_lease_thread: valid lease: time left %ld :"
3322                     "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3323                     "tmp_last_renewal_time %ld", time_left,
3324                     sp->last_renewal_time, nfs4_client_resumed,
3325                     tmp_last_renewal_time));
3326 
3327                 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3328                         goto die;
3329 
3330                 if (tmp_last_renewal_time == sp->last_renewal_time ||
3331                     (nfs4_client_resumed != 0 &&
3332                     nfs4_client_resumed > sp->last_renewal_time)) {
3333                         /*
3334                          * Issue a RENEW op since the lease has not been
3335                          * renewed while we slept.
3336                          */
3337                         tmp_now_time = gethrestime_sec();
3338                         error = nfs4renew(sp);
3339                         /*
3340                          * Need to re-acquire sp's lock; nfs4renew()
3341                          * relinquishes it.
3342                          */
3343                         mutex_enter(&sp->s_lock);
3344 
3345                         /*
3346                          * See if someone changed s_thread_exit while we gave
3347                          * up s_lock.
3348                          */
3349                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3350                                 goto die;
3351 
3352                         if (!error) {
3353                                 /*
3354                                  * Check to see if we implicitly renewed while
3355                                  * we waited for a reply to our RENEW call.
3356                                  */
3357                                 if (tmp_last_renewal_time ==
3358                                     sp->last_renewal_time) {
3359                                         /* no implicit renew came */
3360                                         sp->last_renewal_time = tmp_now_time;
3361                                 } else {
3362                                         NFS4_DEBUG(nfs4_client_lease_debug,
3363                                             (CE_NOTE, "renew_thread: did "
3364                                             "implicit renewal before reply "
3365                                             "from server for RENEW"));
3366                                 }
3367                         } else {
3368                                 /* RENEW failed; just log the error */
3369                                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3370                                     "renew_thread: nfs4renew returned error"
3371                                     " %d", error));
3372                         }
3373 
3374                 }
3375         }
3376 
3377 die:
3378         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3379             "nfs4_renew_lease_thread: thread exiting"));
3380 
3381         while (sp->s_otw_call_count != 0) {
3382                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3383                     "nfs4_renew_lease_thread: waiting for outstanding "
3384                     "otw calls to finish for sp 0x%p, current "
3385                     "s_otw_call_count %d", (void *)sp,
3386                     sp->s_otw_call_count));
3387                 mutex_enter(&cpr_lock);
3388                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3389                 mutex_exit(&cpr_lock);
3390                 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3391                 mutex_enter(&cpr_lock);
3392                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3393                 mutex_exit(&cpr_lock);
3394         }
3395         mutex_exit(&sp->s_lock);
3396 
3397         nfs4_server_rele(sp);           /* free the thread's reference */
3398         nfs4_server_rele(sp);           /* free the list's reference */
3399         sp = NULL;
3400 
3401 done:
3402         mutex_enter(&cpr_lock);
3403         CALLB_CPR_EXIT(&cpr_info);  /* drops cpr_lock */
3404         mutex_destroy(&cpr_lock);
3405 
3406         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3407             "nfs4_renew_lease_thread: renew thread exit officially"));
3408 
3409         zthread_exit();
3410         /* NOT REACHED */
3411 }
3412 
3413 /*
3414  * Send out a RENEW op to the server.
3415  * Assumes sp is locked down.
3416  */
3417 static int
3418 nfs4renew(nfs4_server_t *sp)
3419 {
3420         COMPOUND4args_clnt args;
3421         COMPOUND4res_clnt res;
3422         nfs_argop4 argop[1];
3423         int doqueue = 1;
3424         int rpc_error;
3425         cred_t *cr;
3426         mntinfo4_t *mi;
3427         timespec_t prop_time, after_time;
3428         int needrecov = FALSE;
3429         nfs4_recov_state_t recov_state;
3430         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3431 
3432         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3433 
3434         recov_state.rs_flags = 0;
3435         recov_state.rs_num_retry_despite_err = 0;
3436 
3437 recov_retry:
3438         mi = sp->mntinfo4_list;
3439         VFS_HOLD(mi->mi_vfsp);
3440         mutex_exit(&sp->s_lock);
3441         ASSERT(mi != NULL);
3442 
3443         e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3444         if (e.error) {
3445                 VFS_RELE(mi->mi_vfsp);
3446                 return (e.error);
3447         }
3448 
3449         /* Check to see if we're dealing with a marked-dead sp */
3450         mutex_enter(&sp->s_lock);
3451         if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3452                 mutex_exit(&sp->s_lock);
3453                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3454                 VFS_RELE(mi->mi_vfsp);
3455                 return (0);
3456         }
3457 
3458         /* Make sure mi hasn't changed on us */
3459         if (mi != sp->mntinfo4_list) {
3460                 /* Must drop sp's lock to avoid a recursive mutex enter */
3461                 mutex_exit(&sp->s_lock);
3462                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3463                 VFS_RELE(mi->mi_vfsp);
3464                 mutex_enter(&sp->s_lock);
3465                 goto recov_retry;
3466         }
3467         mutex_exit(&sp->s_lock);
3468 
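             /* Build a one-op COMPOUND containing only OP_RENEW. */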
3469         args.ctag = TAG_RENEW;
3470 
3471         args.array_len = 1;
3472         args.array = argop;
3473 
3474         argop[0].argop = OP_RENEW;
3475 
3476         mutex_enter(&sp->s_lock);
3477         argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3478         cr = sp->s_cred;
3479         crhold(cr);
3480         mutex_exit(&sp->s_lock);
3481 
3482         ASSERT(cr != NULL);
3483 
3484         /* used to figure out RTT for sp */
3485         gethrestime(&prop_time);
3486 
3487         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3488             "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3489             (void*)sp));
3490         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3491             prop_time.tv_sec, prop_time.tv_nsec));
3492 
3493         DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3494             mntinfo4_t *, mi);
3495 
3496         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3497         crfree(cr);
3498 
3499         DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3500             mntinfo4_t *, mi);
3501 
3502         gethrestime(&after_time);
3503 
3504         mutex_enter(&sp->s_lock);
3505         sp->propagation_delay.tv_sec =
3506             MAX(1, after_time.tv_sec - prop_time.tv_sec);
3507         mutex_exit(&sp->s_lock);
3508 
3509         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3510             after_time.tv_sec, after_time.tv_nsec));
3511 
3512         if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3513                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3514                 nfs4_delegreturn_all(sp);
3515                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3516                 VFS_RELE(mi->mi_vfsp);
3517                 /*
3518                  * If the server returns CB_PATH_DOWN, it has renewed
3519                  * the lease and informed us that the callback path is
3520                  * down.  Since the lease is renewed, just return 0 and
3521                  * let the renew thread proceed as normal.
3522                  */
3523                 return (0);
3524         }
3525 
3526         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3527         if (!needrecov && e.error) {
3528                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3529                 VFS_RELE(mi->mi_vfsp);
3530                 return (e.error);
3531         }
3532 
3533         rpc_error = e.error;
3534 
3535         if (needrecov) {
3536                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3537                     "nfs4renew: initiating recovery\n"));
3538 
3539                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3540                     OP_RENEW, NULL, NULL, NULL) == FALSE) {
3541                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3542                         VFS_RELE(mi->mi_vfsp);
3543                         if (!e.error)
3544                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
3545                                     (caddr_t)&res);
3546                         mutex_enter(&sp->s_lock);
3547                         goto recov_retry;
3548                 }
3549                 /* fall through for res.status case */
3550         }
3551 
3552         if (res.status) {
3553                 if (res.status == NFS4ERR_LEASE_MOVED) {
3554                         /*EMPTY*/
3555                         /*
3556                          * XXX need to try every mntinfo4 in sp->mntinfo4_list
3557                          * to renew the lease on that server
3558                          */
3559                 }
3560                 e.error = geterrno4(res.status);
3561         }
3562 
3563         if (!rpc_error)
3564                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3565 
3566         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3567 
3568         VFS_RELE(mi->mi_vfsp);
3569 
3570         return (e.error);
3571 }
3572 
3573 void
3574 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3575 {
3576         nfs4_server_t   *sp;
3577 
3578         /* this locks down sp if it is found */
3579         sp = find_nfs4_server(mi);
3580 
3581         if (sp != NULL) {
3582                 nfs4_inc_state_ref_count_nolock(sp, mi);
3583                 mutex_exit(&sp->s_lock);
3584                 nfs4_server_rele(sp);
3585         }
3586 }
3587 
3588 /*
3589  * Bump the number of OPEN files (i.e., those with state) so we know if this
3590  * nfs4_server has any state to maintain a lease for or not.
3591  *
3592  * Also, marks the nfs4_server's lease valid if that hasn't been done already.
3593  */
3594 void
3595 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3596 {
3597         ASSERT(mutex_owned(&sp->s_lock));
3598 
3599         sp->state_ref_count++;
3600         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3601             "nfs4_inc_state_ref_count: state_ref_count now %d",
3602             sp->state_ref_count));
3603 
3604         if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3605                 sp->lease_valid = NFS4_LEASE_VALID;
3606 
3607         /*
3608          * If this call caused the lease to be marked valid and/or
3609          * took the state_ref_count from 0 to 1, then start the clock
3610          * on lease renewal.
3611          */
3612         if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3613                 sp->last_renewal_time = gethrestime_sec();
3614 
3615         /* update the number of open files for mi */
3616         mi->mi_open_files++;
3617 }
3618 
3619 void
3620 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3621 {
3622         nfs4_server_t   *sp;
3623 
3624         /* this locks down sp if it is found */
3625         sp = find_nfs4_server_all(mi, 1);
3626 
3627         if (sp != NULL) {
3628                 nfs4_dec_state_ref_count_nolock(sp, mi);
3629                 mutex_exit(&sp->s_lock);
3630                 nfs4_server_rele(sp);
3631         }
3632 }
3633 
3634 /*
3635  * Decrement the number of OPEN files (i.e., those with state) so we know if
3636  * this nfs4_server has any state to maintain a lease for or not.
3637  */
3638 void
3639 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3640 {
3641         ASSERT(mutex_owned(&sp->s_lock));
3642         ASSERT(sp->state_ref_count != 0);
3643         sp->state_ref_count--;
3644 
3645         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3646             "nfs4_dec_state_ref_count: state ref count now %d",
3647             sp->state_ref_count));
3648 
3649         mi->mi_open_files--;
3650         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3651             "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3652             mi->mi_open_files, mi->mi_flags));
3653 
3654         /* We don't have to hold the mi_lock to test mi_flags */
3655         if (mi->mi_open_files == 0 &&
3656             (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3657                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3658                     "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3659                     "we have closed the last open file", (void*)mi));
3660                 nfs4_remove_mi_from_server(mi, sp);
3661         }
3662 }
3663 
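     /*
      * Return TRUE if the client still holds a valid, unexpired lease with
      * this server, based on the time of the last renewal.
      */
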
3664 bool_t
3665 inlease(nfs4_server_t *sp)
3666 {
3667         bool_t result;
3668 
3669         ASSERT(mutex_owned(&sp->s_lock));
3670 
3671         if (sp->lease_valid == NFS4_LEASE_VALID &&
3672             gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3673                 result = TRUE;
3674         else
3675                 result = FALSE;
3676 
3677         return (result);
3678 }
3679 
3680 
3681 /*
3682  * Return non-zero if the given nfs4_server_t is going through recovery.
3683  */
3684 
3685 int
3686 nfs4_server_in_recovery(nfs4_server_t *sp)
3687 {
3688         return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3689 }
3690 
3691 /*
3692  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3693  * first is less than, equal to, or greater than the second.
3694  */
3695 
3696 int
3697 sfh4cmp(const void *p1, const void *p2)
3698 {
3699         const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3700         const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3701 
3702         return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3703 }
3704 
3705 /*
3706  * Create a table for shared filehandle objects.
3707  */
3708 
3709 void
3710 sfh4_createtab(avl_tree_t *tab)
3711 {
3712         avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3713             offsetof(nfs4_sharedfh_t, sfh_tree));
3714 }
3715 
3716 /*
3717  * Return a shared filehandle object for the given filehandle.  The caller
3718  * is responsible for eventually calling sfh4_rele().
3719  */
3720 
3721 nfs4_sharedfh_t *
3722 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3723 {
3724         nfs4_sharedfh_t *sfh, *nsfh;
3725         avl_index_t where;
3726         nfs4_sharedfh_t skey;
3727 
3728         if (!key) {
3729                 skey.sfh_fh = *fh;
3730                 key = &skey;
3731         }
3732 
3733         nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3734         nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3735         /*
3736          * We allocate the largest possible filehandle size because it's
3737          * not that big, and it saves us from possibly having to resize the
3738          * buffer later.
3739          */
3740         nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3741         bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3742         mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3743         nsfh->sfh_refcnt = 1;
3744         nsfh->sfh_flags = SFH4_IN_TREE;
3745         nsfh->sfh_mi = mi;
3746         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3747             (void *)nsfh));
3748 
3749         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3750         sfh = avl_find(&mi->mi_filehandles, key, &where);
3751         if (sfh != NULL) {
3752                 mutex_enter(&sfh->sfh_lock);
3753                 sfh->sfh_refcnt++;
3754                 mutex_exit(&sfh->sfh_lock);
3755                 nfs_rw_exit(&mi->mi_fh_lock);
3756                 /* free our speculative allocs */
3757                 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3758                 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3759                 return (sfh);
3760         }
3761 
3762         avl_insert(&mi->mi_filehandles, nsfh, where);
3763         nfs_rw_exit(&mi->mi_fh_lock);
3764 
3765         return (nsfh);
3766 }
3767 
3768 /*
3769  * Return a shared filehandle object for the given filehandle.  The caller
3770  * is responsible for eventually calling sfh4_rele().
3771  */
3772 
3773 nfs4_sharedfh_t *
3774 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3775 {
3776         nfs4_sharedfh_t *sfh;
3777         nfs4_sharedfh_t key;
3778 
3779         ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3780 
3781 #ifdef DEBUG
3782         if (nfs4_sharedfh_debug) {
3783                 nfs4_fhandle_t fhandle;
3784 
3785                 fhandle.fh_len = fh->nfs_fh4_len;
3786                 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3787                 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3788                 nfs4_printfhandle(&fhandle);
3789         }
3790 #endif
3791 
3792         /*
3793          * If there's already an object for the given filehandle, bump the
3794          * reference count and return it.  Otherwise, create a new object
3795          * and add it to the AVL tree.
3796          */
3797 
3798         key.sfh_fh = *fh;
3799 
3800         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3801         sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3802         if (sfh != NULL) {
3803                 mutex_enter(&sfh->sfh_lock);
3804                 sfh->sfh_refcnt++;
3805                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3806                     "sfh4_get: found existing %p, new refcnt=%d",
3807                     (void *)sfh, sfh->sfh_refcnt));
3808                 mutex_exit(&sfh->sfh_lock);
3809                 nfs_rw_exit(&mi->mi_fh_lock);
3810                 return (sfh);
3811         }
3812         nfs_rw_exit(&mi->mi_fh_lock);
3813 
3814         return (sfh4_put(fh, mi, &key));
3815 }
3816 
3817 /*
3818  * Get a reference to the given shared filehandle object.
3819  */
3820 
3821 void
3822 sfh4_hold(nfs4_sharedfh_t *sfh)
3823 {
3824         ASSERT(sfh->sfh_refcnt > 0);
3825 
3826         mutex_enter(&sfh->sfh_lock);
3827         sfh->sfh_refcnt++;
3828         NFS4_DEBUG(nfs4_sharedfh_debug,
3829             (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3830             (void *)sfh, sfh->sfh_refcnt));
3831         mutex_exit(&sfh->sfh_lock);
3832 }
3833 
3834 /*
3835  * Release a reference to the given shared filehandle object and null out
3836  * the given pointer.
3837  */
3838 
3839 void
3840 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3841 {
3842         mntinfo4_t *mi;
3843         nfs4_sharedfh_t *sfh = *sfhpp;
3844 
3845         ASSERT(sfh->sfh_refcnt > 0);
3846 
3847         mutex_enter(&sfh->sfh_lock);
3848         if (sfh->sfh_refcnt > 1) {
3849                 sfh->sfh_refcnt--;
3850                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3851                     "sfh4_rele %p, new refcnt=%d",
3852                     (void *)sfh, sfh->sfh_refcnt));
3853                 mutex_exit(&sfh->sfh_lock);
3854                 goto finish;
3855         }
3856         mutex_exit(&sfh->sfh_lock);
3857 
3858         /*
3859          * Possibly the last reference, so get the lock for the table in
3860          * case it's time to remove the object from the table.
3861          */
3862         mi = sfh->sfh_mi;
3863         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3864         mutex_enter(&sfh->sfh_lock);
3865         sfh->sfh_refcnt--;
3866         if (sfh->sfh_refcnt > 0) {
3867                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3868                     "sfh4_rele %p, new refcnt=%d",
3869                     (void *)sfh, sfh->sfh_refcnt));
3870                 mutex_exit(&sfh->sfh_lock);
3871                 nfs_rw_exit(&mi->mi_fh_lock);
3872                 goto finish;
3873         }
3874 
3875         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3876             "sfh4_rele %p, last ref", (void *)sfh));
3877         if (sfh->sfh_flags & SFH4_IN_TREE) {
3878                 avl_remove(&mi->mi_filehandles, sfh);
3879                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3880         }
3881         mutex_exit(&sfh->sfh_lock);
3882         nfs_rw_exit(&mi->mi_fh_lock);
3883         mutex_destroy(&sfh->sfh_lock);
3884         kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3885         kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3886 
3887 finish:
3888         *sfhpp = NULL;
3889 }
3890 
3891 /*
3892  * Update the filehandle for the given shared filehandle object.
3893  */
3894 
3895 int nfs4_warn_dupfh = 0;        /* if set, always warn about dup fhs below */
3896 
3897 void
3898 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3899 {
3900         mntinfo4_t *mi = sfh->sfh_mi;
3901         nfs4_sharedfh_t *dupsfh;
3902         avl_index_t where;
3903         nfs4_sharedfh_t key;
3904 
3905 #ifdef DEBUG
3906         mutex_enter(&sfh->sfh_lock);
3907         ASSERT(sfh->sfh_refcnt > 0);
3908         mutex_exit(&sfh->sfh_lock);
3909 #endif
3910         ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3911 
3912         /*
3913          * The basic plan is to remove the shared filehandle object from
3914          * the table, update it to have the new filehandle, then reinsert
3915          * it.
3916          */
3917 
3918         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3919         mutex_enter(&sfh->sfh_lock);
3920         if (sfh->sfh_flags & SFH4_IN_TREE) {
3921                 avl_remove(&mi->mi_filehandles, sfh);
3922                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3923         }
3924         mutex_exit(&sfh->sfh_lock);
3925         sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3926         bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3927             sfh->sfh_fh.nfs_fh4_len);
3928 
3929         /*
3930          * XXX If there is already a shared filehandle object with the new
3931          * filehandle, we're in trouble, because the rnode code assumes
3932          * that there is only one shared filehandle object for a given
3933          * filehandle.  So issue a warning (for read-write mounts only)
3934          * and don't try to re-insert the given object into the table.
3935          * Hopefully the given object will quickly go away and everyone
3936          * will use the new object.
3937          */
3938         key.sfh_fh = *newfh;
3939         dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3940         if (dupsfh != NULL) {
3941                 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3942                         zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3943                             "duplicate filehandle detected");
3944                         sfh4_printfhandle(dupsfh);
3945                 }
3946         } else {
3947                 avl_insert(&mi->mi_filehandles, sfh, where);
3948                 mutex_enter(&sfh->sfh_lock);
3949                 sfh->sfh_flags |= SFH4_IN_TREE;
3950                 mutex_exit(&sfh->sfh_lock);
3951         }
3952         nfs_rw_exit(&mi->mi_fh_lock);
3953 }
3954 
3955 /*
3956  * Copy out the current filehandle for the given shared filehandle object.
3957  */
3958 
3959 void
3960 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3961 {
3962         mntinfo4_t *mi = sfh->sfh_mi;
3963 
3964         ASSERT(sfh->sfh_refcnt > 0);
3965 
3966         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3967         fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3968         ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3969         bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3970         nfs_rw_exit(&mi->mi_fh_lock);
3971 }
3972 
3973 /*
3974  * Print out the filehandle for the given shared filehandle object.
3975  */
3976 
3977 void
3978 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3979 {
3980         nfs4_fhandle_t fhandle;
3981 
3982         sfh4_copyval(sfh, &fhandle);
3983         nfs4_printfhandle(&fhandle);
3984 }
3985 
3986 /*
3987  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3988  * if they're the same, +1 if the first is "greater" than the second.  The
3989  * caller (or whoever's calling the AVL package) is responsible for
3990  * handling locking issues.
3991  */
3992 
3993 static int
3994 fncmp(const void *p1, const void *p2)
3995 {
3996         const nfs4_fname_t *f1 = p1;
3997         const nfs4_fname_t *f2 = p2;
3998         int res;
3999 
4000         res = strcmp(f1->fn_name, f2->fn_name);
4001         /*
4002          * The AVL package wants +/-1, not arbitrary positive or negative
4003          * integers.
4004          */
4005         if (res > 0)
4006                 res = 1;
4007         else if (res < 0)
4008                 res = -1;
4009         return (res);
4010 }
4011 
4012 /*
4013  * Get or create an fname with the given name, as a child of the given
4014  * fname.  The caller is responsible for eventually releasing the reference
4015  * (fn_rele()).  parent may be NULL.
4016  */
4017 
4018 nfs4_fname_t *
4019 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4020 {
4021         nfs4_fname_t key;
4022         nfs4_fname_t *fnp;
4023         avl_index_t where;
4024 
4025         key.fn_name = name;
4026 
4027         /*
4028          * If there's already an fname registered with the given name, bump
4029          * its reference count and return it.  Otherwise, create a new one
4030          * and add it to the parent's AVL tree.
4031          *
4032          * fname entries we are looking for should match both name
4033          * and sfh stored in the fname.
4034          */
4035 again:
4036         if (parent != NULL) {
4037                 mutex_enter(&parent->fn_lock);
4038                 fnp = avl_find(&parent->fn_children, &key, &where);
4039                 if (fnp != NULL) {
4040                         /*
4041                          * This hold on fnp is released below if this
4042                          * turns out not to be the fnp we want.
4043                          */
4044                         fn_hold(fnp);
4045 
4046                         if (fnp->fn_sfh == sfh) {
4047                                 /*
4048                                  * We have found our entry; return it
4049                                  * with the hold taken above.
4050                                  */
4051                                 mutex_exit(&parent->fn_lock);
4052                                 return (fnp);
4053                         }
4054 
4055                         /*
4056                          * We have found an entry that has a mismatching
4057                          * fn_sfh. This could be a stale entry due to
4058                          * fn_sfh.  This could be a stale entry due to a
4059                          * server-side rename.  We will remove this entry
4060                          */
4061                         mutex_exit(&parent->fn_lock);
4062                         mutex_enter(&fnp->fn_lock);
4063                         if (fnp->fn_parent == parent) {
4064                                 /*
4065                                  * Remove ourselves from parent's
4066                                  * fn_children tree.
4067                                  */
4068                                 mutex_enter(&parent->fn_lock);
4069                                 avl_remove(&parent->fn_children, fnp);
4070                                 mutex_exit(&parent->fn_lock);
4071                                 fn_rele(&fnp->fn_parent);
4072                         }
4073                         mutex_exit(&fnp->fn_lock);
4074                         fn_rele(&fnp);
4075                         goto again;
4076                 }
4077         }
4078 
4079         fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4080         mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4081         fnp->fn_parent = parent;
4082         if (parent != NULL)
4083                 fn_hold(parent);
4084         fnp->fn_len = strlen(name);
4085         ASSERT(fnp->fn_len < MAXNAMELEN);
4086         fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4087         (void) strcpy(fnp->fn_name, name);
4088         fnp->fn_refcnt = 1;
4089 
4090         /*
4091          * This hold on sfh is later released
4092          * when we do the final fn_rele() on this fname.
4093          */
4094         sfh4_hold(sfh);
4095         fnp->fn_sfh = sfh;
4096 
4097         avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4098             offsetof(nfs4_fname_t, fn_tree));
4099         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4100             "fn_get %p:%s, a new nfs4_fname_t!",
4101             (void *)fnp, fnp->fn_name));
4102         if (parent != NULL) {
4103                 avl_insert(&parent->fn_children, fnp, where);
4104                 mutex_exit(&parent->fn_lock);
4105         }
4106 
4107         return (fnp);
4108 }
4109 
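     /*
      * Take a hold on the given fname.
      */
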
4110 void
4111 fn_hold(nfs4_fname_t *fnp)
4112 {
4113         atomic_add_32(&fnp->fn_refcnt, 1);
4114         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4115             "fn_hold %p:%s, new refcnt=%d",
4116             (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4117 }
4118 
4119 /*
4120  * Decrement the reference count of the given fname, and destroy it if its
4121  * reference count goes to zero.  Nulls out the given pointer.
4122  */
4123 
4124 void
4125 fn_rele(nfs4_fname_t **fnpp)
4126 {
4127         nfs4_fname_t *parent;
4128         uint32_t newref;
4129         nfs4_fname_t *fnp;
4130 
4131 recur:
4132         fnp = *fnpp;
4133         *fnpp = NULL;
4134 
4135         mutex_enter(&fnp->fn_lock);
4136         parent = fnp->fn_parent;
4137         if (parent != NULL)
4138                 mutex_enter(&parent->fn_lock);   /* prevent new references */
4139         newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4140         if (newref > 0) {
4141                 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4142                     "fn_rele %p:%s, new refcnt=%d",
4143                     (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4144                 if (parent != NULL)
4145                         mutex_exit(&parent->fn_lock);
4146                 mutex_exit(&fnp->fn_lock);
4147                 return;
4148         }
4149 
4150         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4151             "fn_rele %p:%s, last reference, deleting...",
4152             (void *)fnp, fnp->fn_name));
4153         if (parent != NULL) {
4154                 avl_remove(&parent->fn_children, fnp);
4155                 mutex_exit(&parent->fn_lock);
4156         }
4157         kmem_free(fnp->fn_name, fnp->fn_len + 1);
4158         sfh4_rele(&fnp->fn_sfh);
4159         mutex_destroy(&fnp->fn_lock);
4160         avl_destroy(&fnp->fn_children);
4161         kmem_free(fnp, sizeof (nfs4_fname_t));
4162         /*
4163          * Recursively fn_rele the parent.
4164          * Use goto instead of a recursive call to avoid stack overflow.
4165          */
4166         if (parent != NULL) {
4167                 fnpp = &parent;
4168                 goto recur;
4169         }
4170 }
4171 
4172 /*
4173  * Returns the single component name of the given fname, in a MAXNAMELEN
4174  * string buffer, which the caller is responsible for freeing.  Note that
4175  * the name may become invalid as a result of fn_move().
4176  */
4177 
4178 char *
4179 fn_name(nfs4_fname_t *fnp)
4180 {
4181         char *name;
4182 
4183         ASSERT(fnp->fn_len < MAXNAMELEN);
4184         name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4185         mutex_enter(&fnp->fn_lock);
4186         (void) strcpy(name, fnp->fn_name);
4187         mutex_exit(&fnp->fn_lock);
4188 
4189         return (name);
4190 }
4191 
4192 
4193 /*
4194  * fn_path_realloc
4195  *
4196  * This function, used only by fn_path, constructs
4197  * a new string that looks like "prepend" + "/" + "current",
4198  * by allocating a new string and freeing the old one.
4199  */
4200 static void
4201 fn_path_realloc(char **curses, char *prepend)
4202 {
4203         int len, curlen = 0;
4204         char *news;
4205 
4206         if (*curses == NULL) {
4207                 /*
4208                  * Prime the pump, allocate just the
4209                  * space for prepend and return that.
4210                  */
4211                 len = strlen(prepend) + 1;
4212                 news = kmem_alloc(len, KM_SLEEP);
4213                 (void) strncpy(news, prepend, len);
4214         } else {
4215                 /*
4216                  * Allocate the space for the new string;
4217                  * the +1 +1 is for the "/" and the NULL
4218                  * byte at the end of it all.
4219                  */
4220                 curlen = strlen(*curses);
4221                 len = curlen + strlen(prepend) + 1 + 1;
4222                 news = kmem_alloc(len, KM_SLEEP);
4223                 (void) strncpy(news, prepend, len);
4224                 (void) strcat(news, "/");
4225                 (void) strcat(news, *curses);
4226                 kmem_free(*curses, curlen + 1);
4227         }
4228         *curses = news;
4229 }
4230 
4231 /*
4232  * Returns the path name (starting from the fs root) for the given fname.
4233  * The caller is responsible for freeing.  Note that the path may be or
4234  * become invalid as a result of fn_move().
4235  */
4236 
4237 char *
4238 fn_path(nfs4_fname_t *fnp)
4239 {
4240         char *path;
4241         nfs4_fname_t *nextfnp;
4242 
4243         if (fnp == NULL)
4244                 return (NULL);
4245 
4246         path = NULL;
4247 
4248         /* walk up the tree constructing the pathname.  */
4249 
4250         fn_hold(fnp);                   /* adjust for later rele */
4251         do {
4252                 mutex_enter(&fnp->fn_lock);
4253                 /*
4254                  * Add fn_name in front of the current path.
4255                  */
4256                 fn_path_realloc(&path, fnp->fn_name);
4257                 nextfnp = fnp->fn_parent;
4258                 if (nextfnp != NULL)
4259                         fn_hold(nextfnp);
4260                 mutex_exit(&fnp->fn_lock);
4261                 fn_rele(&fnp);
4262                 fnp = nextfnp;
4263         } while (fnp != NULL);
4264 
4265         return (path);
4266 }
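/*
 * Illustrative usage (an assumption, not taken from this file): because
 * fn_path_realloc() sizes the buffer to the string it builds, a
 * hypothetical caller would free the result based on its length, e.g.:
 *
 *	char *path = fn_path(fnp);
 *	if (path != NULL) {
 *		...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 */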
4267 
4268 /*
4269  * Return a reference to the parent of the given fname, which the caller is
4270  * responsible for eventually releasing.
4271  */
4272 
4273 nfs4_fname_t *
4274 fn_parent(nfs4_fname_t *fnp)
4275 {
4276         nfs4_fname_t *parent;
4277 
4278         mutex_enter(&fnp->fn_lock);
4279         parent = fnp->fn_parent;
4280         if (parent != NULL)
4281                 fn_hold(parent);
4282         mutex_exit(&fnp->fn_lock);
4283 
4284         return (parent);
4285 }
4286 
4287 /*
4288  * Update fnp so that its parent is newparent and its name is newname.
4289  */
4290 
4291 void
4292 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4293 {
4294         nfs4_fname_t *parent, *tmpfnp;
4295         ssize_t newlen;
4296         nfs4_fname_t key;
4297         avl_index_t where;
4298 
4299         /*
4300          * This assert exists to catch the client trying to rename
4301          * a dir to be a child of itself.  This happened at a recent
4302          * bakeoff against a 3rd party (broken) server which allowed
4303          * the rename to succeed.  If it trips it means that:
4304          *      a) the code in nfs4rename that detects this case is broken
4305          *      b) the server is broken (since it allowed the bogus rename)
4306          *
4307          * For non-DEBUG kernels, prepare for a recursive mutex_enter
4308          * panic below from:  mutex_enter(&newparent->fn_lock);
4309          */
4310         ASSERT(fnp != newparent);
4311 
4312         /*
4313          * Remove fnp from its current parent, change its name, then add it
4314          * to newparent. It might happen that fnp was replaced by another
4315          * nfs4_fname_t with the same fn_name in parent->fn_children.
4316          * In such a case, fnp->fn_parent is NULL and we skip the removal
4317          * of fnp from its current parent.
4318          */
4319         mutex_enter(&fnp->fn_lock);
4320         parent = fnp->fn_parent;
4321         if (parent != NULL) {
4322                 mutex_enter(&parent->fn_lock);
4323                 avl_remove(&parent->fn_children, fnp);
4324                 mutex_exit(&parent->fn_lock);
4325                 fn_rele(&fnp->fn_parent);
4326         }
4327 
4328         newlen = strlen(newname);
4329         if (newlen != fnp->fn_len) {
4330                 ASSERT(newlen < MAXNAMELEN);
4331                 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4332                 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4333                 fnp->fn_len = newlen;
4334         }
4335         (void) strcpy(fnp->fn_name, newname);
4336 
4337 again:
4338         mutex_enter(&newparent->fn_lock);
4339         key.fn_name = fnp->fn_name;
4340         tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4341         if (tmpfnp != NULL) {
4342                 /*
4343                  * This could be due to a file that was unlinked while
4344                  * open, or perhaps the rnode is in the free list.  Remove
4345                  * it from newparent and let it go away on its own.  The
4346                  * contorted code is to deal with lock order issues and
4347                  * race conditions.
4348                  */
4349                 fn_hold(tmpfnp);
4350                 mutex_exit(&newparent->fn_lock);
4351                 mutex_enter(&tmpfnp->fn_lock);
4352                 if (tmpfnp->fn_parent == newparent) {
4353                         mutex_enter(&newparent->fn_lock);
4354                         avl_remove(&newparent->fn_children, tmpfnp);
4355                         mutex_exit(&newparent->fn_lock);
4356                         fn_rele(&tmpfnp->fn_parent);
4357                 }
4358                 mutex_exit(&tmpfnp->fn_lock);
4359                 fn_rele(&tmpfnp);
4360                 goto again;
4361         }
4362         fnp->fn_parent = newparent;
4363         fn_hold(newparent);
4364         avl_insert(&newparent->fn_children, fnp, where);
4365         mutex_exit(&newparent->fn_lock);
4366         mutex_exit(&fnp->fn_lock);
4367 }
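/*
 * Illustrative call site (an assumption, not taken from this file):
 * after a successful over-the-wire rename, the client would re-link the
 * cached fname of the renamed object under the destination directory's
 * fname, roughly:
 *
 *	fn_move(renamed_fnp, new_dir_fnp, newname);
 *
 * where renamed_fnp and new_dir_fnp are hypothetical nfs4_fname_t
 * pointers for the renamed object and its new parent directory.
 */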
4368 
4369 #ifdef DEBUG
4370 /*
4371  * Return non-zero if the type information makes sense for the given vnode.
4372  * Otherwise panic.
4373  */
4374 int
4375 nfs4_consistent_type(vnode_t *vp)
4376 {
4377         rnode4_t *rp = VTOR4(vp);
4378 
4379         if (nfs4_vtype_debug && vp->v_type != VNON &&
4380             rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4381                 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4382                     "rnode attr type=%d", (void *)vp, vp->v_type,
4383                     rp->r_attr.va_type);
4384         }
4385 
4386         return (1);
4387 }
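/*
 * Illustrative usage (assumed): because nfs4_consistent_type() either
 * panics or returns 1, callers would typically wrap it in an assertion,
 * e.g.
 *
 *	ASSERT(nfs4_consistent_type(vp));
 *
 * so the check compiles away entirely in non-DEBUG kernels.
 */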
4388 #endif /* DEBUG */
--- EOF ---