NFS4 data corruption (#3508)

If asynchronous I/O is disabled, nfs4_async_putapage() is supposed to do its work synchronously. Due to a bug, it sometimes does nothing at all and leaves the page dirty for later. Unfortunately the caller has already cleared the R4DIRTY flag. Without R4DIRTY, nfs4_attrcache_va() cannot see that there are still outstanding writes, so it accepts the file size reported by the server, which is too small. When the dirty page is finally written back, the write is truncated to that file size, leaving some bytes unwritten.

Reviewed by: Marcel Telka <marcel@telka.sk>
Reviewed by: Robert Gordon <rbg@openrbg.com>
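In essence, the fix makes the noasync: fall-back of nfs4_async_putapage() mirror the one already used by nfs4_async_pageio(): only pageout/fsflush punt the page back to dirty, cross-zone requests fail, and every other caller does the write synchronously in its own context. A condensed sketch, not the verbatim diff (all identifiers are the function's existing parameters and locals; the full comments are in the source below):

    noasync:
        if (curproc == proc_pageout || curproc == proc_fsflush) {
            /* never issue a sync write from pageout/fsflush: re-dirty the page */
            if (flags & B_FORCE)
                flags &= ~(B_INVAL | B_FORCE);
            pvn_write_done(pp, flags | B_ERROR);
            return (0);
        }
        if (nfs_zone() != mi->mi_zone) {
            /* cross-zone sync putpage: re-dirty the page and fail */
            pvn_write_done(pp, flags | B_ERROR);
            return (EPERM);
        }
        /* async I/O disabled or no memory: write the page now, synchronously */
        return ((*putapage)(vp, pp, off, len, flags, cr));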
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 27 * All Rights Reserved 28 */ 29 30 #include <sys/param.h> 31 #include <sys/types.h> 32 #include <sys/systm.h> 33 #include <sys/thread.h> 34 #include <sys/t_lock.h> 35 #include <sys/time.h> 36 #include <sys/vnode.h> 37 #include <sys/vfs.h> 38 #include <sys/errno.h> 39 #include <sys/buf.h> 40 #include <sys/stat.h> 41 #include <sys/cred.h> 42 #include <sys/kmem.h> 43 #include <sys/debug.h> 44 #include <sys/dnlc.h> 45 #include <sys/vmsystm.h> 46 #include <sys/flock.h> 47 #include <sys/share.h> 48 #include <sys/cmn_err.h> 49 #include <sys/tiuser.h> 50 #include <sys/sysmacros.h> 51 #include <sys/callb.h> 52 #include <sys/acl.h> 53 #include <sys/kstat.h> 54 #include <sys/signal.h> 55 #include <sys/disp.h> 56 #include <sys/atomic.h> 57 #include <sys/list.h> 58 #include <sys/sdt.h> 59 60 #include <rpc/types.h> 61 #include <rpc/xdr.h> 62 #include <rpc/auth.h> 63 #include <rpc/clnt.h> 64 65 #include <nfs/nfs.h> 66 #include <nfs/nfs_clnt.h> 67 #include <nfs/nfs_acl.h> 68 69 #include <nfs/nfs4.h> 70 #include <nfs/rnode4.h> 71 #include <nfs/nfs4_clnt.h> 72 73 #include <vm/hat.h> 74 #include <vm/as.h> 75 #include <vm/page.h> 76 #include <vm/pvn.h> 77 #include <vm/seg.h> 78 #include <vm/seg_map.h> 79 #include <vm/seg_vn.h> 80 81 #include <sys/ddi.h> 82 83 /* 84 * Arguments to page-flush thread. 85 */ 86 typedef struct { 87 vnode_t *vp; 88 cred_t *cr; 89 } pgflush_t; 90 91 #ifdef DEBUG 92 int nfs4_client_lease_debug; 93 int nfs4_sharedfh_debug; 94 int nfs4_fname_debug; 95 96 /* temporary: panic if v_type is inconsistent with r_attr va_type */ 97 int nfs4_vtype_debug; 98 99 uint_t nfs4_tsd_key; 100 #endif 101 102 static time_t nfs4_client_resumed = 0; 103 static callb_id_t cid = 0; 104 105 static int nfs4renew(nfs4_server_t *); 106 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); 107 static void nfs4_pgflush_thread(pgflush_t *); 108 109 static boolean_t nfs4_client_cpr_callb(void *, int); 110 111 struct mi4_globals { 112 kmutex_t mig_lock; /* lock protecting mig_list */ 113 list_t mig_list; /* list of NFS v4 mounts in zone */ 114 boolean_t mig_destructor_called; 115 }; 116 117 static zone_key_t mi4_list_key; 118 119 /* 120 * Attributes caching: 121 * 122 * Attributes are cached in the rnode in struct vattr form. 123 * There is a time associated with the cached attributes (r_time_attr_inval) 124 * which tells whether the attributes are valid. The time is initialized 125 * to the difference between current time and the modify time of the vnode 126 * when new attributes are cached. 
This allows the attributes for 127 * files that have changed recently to be timed out sooner than for files 128 * that have not changed for a long time. There are minimum and maximum 129 * timeout values that can be set per mount point. 130 */ 131 132 /* 133 * If a cache purge is in progress, wait for it to finish. 134 * 135 * The current thread must not be in the middle of an 136 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock 137 * between this thread, a recovery thread, and the page flush thread. 138 */ 139 int 140 nfs4_waitfor_purge_complete(vnode_t *vp) 141 { 142 rnode4_t *rp; 143 k_sigset_t smask; 144 145 rp = VTOR4(vp); 146 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 147 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 148 mutex_enter(&rp->r_statelock); 149 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 150 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 151 ((rp->r_flags & R4PGFLUSH) && 152 rp->r_pgflush != curthread)) { 153 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 154 sigunintr(&smask); 155 mutex_exit(&rp->r_statelock); 156 return (EINTR); 157 } 158 } 159 sigunintr(&smask); 160 mutex_exit(&rp->r_statelock); 161 } 162 return (0); 163 } 164 165 /* 166 * Validate caches by checking cached attributes. If they have timed out, 167 * then get new attributes from the server. As a side effect, cache 168 * invalidation is done if the attributes have changed. 169 * 170 * If the attributes have not timed out and if there is a cache 171 * invalidation being done by some other thread, then wait until that 172 * thread has completed the cache invalidation. 173 */ 174 int 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 176 { 177 int error; 178 nfs4_ga_res_t gar; 179 180 if (ATTRCACHE4_VALID(vp)) { 181 error = nfs4_waitfor_purge_complete(vp); 182 if (error) 183 return (error); 184 return (0); 185 } 186 187 gar.n4g_va.va_mask = AT_ALL; 188 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 189 } 190 191 /* 192 * Fill in attribute from the cache. 193 * If valid, then return 0 to indicate that no error occurred, 194 * otherwise return 1 to indicate that an error occurred. 195 */ 196 static int 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 198 { 199 rnode4_t *rp; 200 201 rp = VTOR4(vp); 202 mutex_enter(&rp->r_statelock); 203 mutex_enter(&rp->r_statev4_lock); 204 if (ATTRCACHE4_VALID(vp)) { 205 mutex_exit(&rp->r_statev4_lock); 206 /* 207 * Cached attributes are valid 208 */ 209 *vap = rp->r_attr; 210 mutex_exit(&rp->r_statelock); 211 return (0); 212 } 213 mutex_exit(&rp->r_statev4_lock); 214 mutex_exit(&rp->r_statelock); 215 return (1); 216 } 217 218 219 /* 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 221 * call is synchronous because all the pages were invalidated by the 222 * nfs4_invalidate_pages() call. 223 */ 224 void 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 226 { 227 struct rnode4 *rp = VTOR4(vp); 228 229 /* Ensure that the ..._end_op() call has been done */ 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 231 232 if (errno != ESTALE) 233 return; 234 235 mutex_enter(&rp->r_statelock); 236 rp->r_flags |= R4STALE; 237 if (!rp->r_error) 238 rp->r_error = errno; 239 mutex_exit(&rp->r_statelock); 240 if (nfs4_has_pages(vp)) 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 243 } 244 245 /* 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 247 * page purge is done asynchronously. 
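 *
 * (For example, nfs4_attr_cache() below passes a TRUE "asyncpg" when it is
 * running as the recovery thread, so that the page purge cannot deadlock
 * against recovery.)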
248 */ 249 void 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 251 { 252 rnode4_t *rp; 253 char *contents; 254 vnode_t *xattr; 255 int size; 256 int pgflush; /* are we the page flush thread? */ 257 258 /* 259 * Purge the DNLC for any entries which refer to this file. 260 */ 261 if (vp->v_count > 1 && 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 263 dnlc_purge_vp(vp); 264 265 /* 266 * Clear any readdir state bits and purge the readlink response cache. 267 */ 268 rp = VTOR4(vp); 269 mutex_enter(&rp->r_statelock); 270 rp->r_flags &= ~R4LOOKUP; 271 contents = rp->r_symlink.contents; 272 size = rp->r_symlink.size; 273 rp->r_symlink.contents = NULL; 274 275 xattr = rp->r_xattr_dir; 276 rp->r_xattr_dir = NULL; 277 278 /* 279 * Purge pathconf cache too. 280 */ 281 rp->r_pathconf.pc4_xattr_valid = 0; 282 rp->r_pathconf.pc4_cache_valid = 0; 283 284 pgflush = (curthread == rp->r_pgflush); 285 mutex_exit(&rp->r_statelock); 286 287 if (contents != NULL) { 288 289 kmem_free((void *)contents, size); 290 } 291 292 if (xattr != NULL) 293 VN_RELE(xattr); 294 295 /* 296 * Flush the page cache. If the current thread is the page flush 297 * thread, don't initiate a new page flush. There's no need for 298 * it, and doing it correctly is hard. 299 */ 300 if (nfs4_has_pages(vp) && !pgflush) { 301 if (!asyncpg) { 302 (void) nfs4_waitfor_purge_complete(vp); 303 nfs4_flush_pages(vp, cr); 304 } else { 305 pgflush_t *args; 306 307 /* 308 * We don't hold r_statelock while creating the 309 * thread, in case the call blocks. So we use a 310 * flag to indicate that a page flush thread is 311 * active. 312 */ 313 mutex_enter(&rp->r_statelock); 314 if (rp->r_flags & R4PGFLUSH) { 315 mutex_exit(&rp->r_statelock); 316 } else { 317 rp->r_flags |= R4PGFLUSH; 318 mutex_exit(&rp->r_statelock); 319 320 args = kmem_alloc(sizeof (pgflush_t), 321 KM_SLEEP); 322 args->vp = vp; 323 VN_HOLD(args->vp); 324 args->cr = cr; 325 crhold(args->cr); 326 (void) zthread_create(NULL, 0, 327 nfs4_pgflush_thread, args, 0, 328 minclsyspri); 329 } 330 } 331 } 332 333 /* 334 * Flush the readdir response cache. 335 */ 336 nfs4_purge_rddir_cache(vp); 337 } 338 339 /* 340 * Invalidate all pages for the given file, after writing back the dirty 341 * ones. 342 */ 343 344 void 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr) 346 { 347 int error; 348 rnode4_t *rp = VTOR4(vp); 349 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 351 if (error == ENOSPC || error == EDQUOT) { 352 mutex_enter(&rp->r_statelock); 353 if (!rp->r_error) 354 rp->r_error = error; 355 mutex_exit(&rp->r_statelock); 356 } 357 } 358 359 /* 360 * Page flush thread. 361 */ 362 363 static void 364 nfs4_pgflush_thread(pgflush_t *args) 365 { 366 rnode4_t *rp = VTOR4(args->vp); 367 368 /* remember which thread we are, so we don't deadlock ourselves */ 369 mutex_enter(&rp->r_statelock); 370 ASSERT(rp->r_pgflush == NULL); 371 rp->r_pgflush = curthread; 372 mutex_exit(&rp->r_statelock); 373 374 nfs4_flush_pages(args->vp, args->cr); 375 376 mutex_enter(&rp->r_statelock); 377 rp->r_pgflush = NULL; 378 rp->r_flags &= ~R4PGFLUSH; 379 cv_broadcast(&rp->r_cv); 380 mutex_exit(&rp->r_statelock); 381 382 VN_RELE(args->vp); 383 crfree(args->cr); 384 kmem_free(args, sizeof (pgflush_t)); 385 zthread_exit(); 386 } 387 388 /* 389 * Purge the readdir cache of all entries which are not currently 390 * being filled. 
391 */ 392 void 393 nfs4_purge_rddir_cache(vnode_t *vp) 394 { 395 rnode4_t *rp; 396 397 rp = VTOR4(vp); 398 399 mutex_enter(&rp->r_statelock); 400 rp->r_direof = NULL; 401 rp->r_flags &= ~R4LOOKUP; 402 rp->r_flags |= R4READDIRWATTR; 403 rddir4_cache_purge(rp); 404 mutex_exit(&rp->r_statelock); 405 } 406 407 /* 408 * Set attributes cache for given vnode using virtual attributes. There is 409 * no cache validation, but if the attributes are deemed to be stale, they 410 * are ignored. This corresponds to nfs3_attrcache(). 411 * 412 * Set the timeout value on the attribute cache and fill it 413 * with the passed in attributes. 414 */ 415 void 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 417 { 418 rnode4_t *rp = VTOR4(vp); 419 420 mutex_enter(&rp->r_statelock); 421 if (rp->r_time_attr_saved <= t) 422 nfs4_attrcache_va(vp, garp, FALSE); 423 mutex_exit(&rp->r_statelock); 424 } 425 426 /* 427 * Use the passed in virtual attributes to check to see whether the 428 * data and metadata caches are valid, cache the new attributes, and 429 * then do the cache invalidation if required. 430 * 431 * The cache validation and caching of the new attributes is done 432 * atomically via the use of the mutex, r_statelock. If required, 433 * the cache invalidation is done atomically w.r.t. the cache 434 * validation and caching of the attributes via the pseudo lock, 435 * r_serial. 436 * 437 * This routine is used to do cache validation and attributes caching 438 * for operations with a single set of post operation attributes. 439 */ 440 441 void 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 443 hrtime_t t, cred_t *cr, int async, 444 change_info4 *cinfo) 445 { 446 rnode4_t *rp; 447 int mtime_changed = 0; 448 int ctime_changed = 0; 449 vsecattr_t *vsp; 450 int was_serial, set_time_cache_inval, recov; 451 vattr_t *vap = &garp->n4g_va; 452 mntinfo4_t *mi = VTOMI4(vp); 453 len_t preattr_rsize; 454 boolean_t writemodify_set = B_FALSE; 455 boolean_t cachepurge_set = B_FALSE; 456 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 458 459 /* Is curthread the recovery thread? */ 460 mutex_enter(&mi->mi_lock); 461 recov = (VTOMI4(vp)->mi_recovthread == curthread); 462 mutex_exit(&mi->mi_lock); 463 464 rp = VTOR4(vp); 465 mutex_enter(&rp->r_statelock); 466 was_serial = (rp->r_serial == curthread); 467 if (rp->r_serial && !was_serial) { 468 klwp_t *lwp = ttolwp(curthread); 469 470 /* 471 * If we're the recovery thread, then purge current attrs 472 * and bail out to avoid potential deadlock between another 473 * thread caching attrs (r_serial thread), recov thread, 474 * and an async writer thread. 475 */ 476 if (recov) { 477 PURGE_ATTRCACHE4_LOCKED(rp); 478 mutex_exit(&rp->r_statelock); 479 return; 480 } 481 482 if (lwp != NULL) 483 lwp->lwp_nostop++; 484 while (rp->r_serial != NULL) { 485 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 486 mutex_exit(&rp->r_statelock); 487 if (lwp != NULL) 488 lwp->lwp_nostop--; 489 return; 490 } 491 } 492 if (lwp != NULL) 493 lwp->lwp_nostop--; 494 } 495 496 /* 497 * If there is a page flush thread, the current thread needs to 498 * bail out, to prevent a possible deadlock between the current 499 * thread (which might be in a start_op/end_op region), the 500 * recovery thread, and the page flush thread. Expire the 501 * attribute cache, so that any attributes the current thread was 502 * going to set are not lost. 
503 */ 504 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) { 505 PURGE_ATTRCACHE4_LOCKED(rp); 506 mutex_exit(&rp->r_statelock); 507 return; 508 } 509 510 if (rp->r_time_attr_saved > t) { 511 /* 512 * Attributes have been cached since these attributes were 513 * probably made. If there is an inconsistency in what is 514 * cached, mark them invalid. If not, don't act on them. 515 */ 516 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 517 PURGE_ATTRCACHE4_LOCKED(rp); 518 mutex_exit(&rp->r_statelock); 519 return; 520 } 521 set_time_cache_inval = 0; 522 if (cinfo) { 523 /* 524 * Only directory modifying callers pass non-NULL cinfo. 525 */ 526 ASSERT(vp->v_type == VDIR); 527 /* 528 * If the cache timeout either doesn't exist or hasn't expired, 529 * and dir didn't changed on server before dirmod op 530 * and dir didn't change after dirmod op but before getattr 531 * then there's a chance that the client's cached data for 532 * this object is current (not stale). No immediate cache 533 * flush is required. 534 * 535 */ 536 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) && 537 cinfo->before == rp->r_change && 538 (garp->n4g_change_valid && 539 cinfo->after == garp->n4g_change)) { 540 541 /* 542 * If atomic isn't set, then the before/after info 543 * cannot be blindly trusted. For this case, we tell 544 * nfs4_attrcache_va to cache the attrs but also 545 * establish an absolute maximum cache timeout. When 546 * the timeout is reached, caches will be flushed. 547 */ 548 if (! cinfo->atomic) 549 set_time_cache_inval = 1; 550 } else { 551 552 /* 553 * We're not sure exactly what changed, but we know 554 * what to do. flush all caches for dir. remove the 555 * attr timeout. 556 * 557 * a) timeout expired. flush all caches. 558 * b) r_change != cinfo.before. flush all caches. 559 * c) r_change == cinfo.before, but cinfo.after != 560 * post-op getattr(change). flush all caches. 561 * d) post-op getattr(change) not provided by server. 562 * flush all caches. 563 */ 564 mtime_changed = 1; 565 ctime_changed = 1; 566 rp->r_time_cache_inval = 0; 567 } 568 } else { 569 /* 570 * Write thread after writing data to file on remote server, 571 * will always set R4WRITEMODIFIED to indicate that file on 572 * remote server was modified with a WRITE operation and would 573 * have marked attribute cache as timed out. If R4WRITEMODIFIED 574 * is set, then do not check for mtime and ctime change. 575 */ 576 if (!(rp->r_flags & R4WRITEMODIFIED)) { 577 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 578 mtime_changed = 1; 579 580 if (rp->r_attr.va_ctime.tv_sec != 581 vap->va_ctime.tv_sec || 582 rp->r_attr.va_ctime.tv_nsec != 583 vap->va_ctime.tv_nsec) 584 ctime_changed = 1; 585 } else { 586 writemodify_set = B_TRUE; 587 } 588 } 589 590 preattr_rsize = rp->r_size; 591 592 nfs4_attrcache_va(vp, garp, set_time_cache_inval); 593 594 /* 595 * If we have updated filesize in nfs4_attrcache_va, as soon as we 596 * drop statelock we will be in transition of purging all 597 * our caches and updating them. It is possible for another 598 * thread to pick this new file size and read in zeroed data. 599 * stall other threads till cache purge is complete. 600 */ 601 if ((!cinfo) && (rp->r_size != preattr_rsize)) { 602 /* 603 * If R4WRITEMODIFIED was set and we have updated the file 604 * size, Server's returned file size need not necessarily 605 * be because of this Client's WRITE. We need to purge 606 * all caches. 
607 */ 608 if (writemodify_set) 609 mtime_changed = 1; 610 611 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) { 612 rp->r_flags |= R4INCACHEPURGE; 613 cachepurge_set = B_TRUE; 614 } 615 } 616 617 if (!mtime_changed && !ctime_changed) { 618 mutex_exit(&rp->r_statelock); 619 return; 620 } 621 622 rp->r_serial = curthread; 623 624 mutex_exit(&rp->r_statelock); 625 626 /* 627 * If we're the recov thread, then force async nfs4_purge_caches 628 * to avoid potential deadlock. 629 */ 630 if (mtime_changed) 631 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async); 632 633 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) { 634 mutex_enter(&rp->r_statelock); 635 rp->r_flags &= ~R4INCACHEPURGE; 636 cv_broadcast(&rp->r_cv); 637 mutex_exit(&rp->r_statelock); 638 cachepurge_set = B_FALSE; 639 } 640 641 if (ctime_changed) { 642 (void) nfs4_access_purge_rp(rp); 643 if (rp->r_secattr != NULL) { 644 mutex_enter(&rp->r_statelock); 645 vsp = rp->r_secattr; 646 rp->r_secattr = NULL; 647 mutex_exit(&rp->r_statelock); 648 if (vsp != NULL) 649 nfs4_acl_free_cache(vsp); 650 } 651 } 652 653 if (!was_serial) { 654 mutex_enter(&rp->r_statelock); 655 rp->r_serial = NULL; 656 cv_broadcast(&rp->r_cv); 657 mutex_exit(&rp->r_statelock); 658 } 659 } 660 661 /* 662 * Set attributes cache for given vnode using virtual attributes. 663 * 664 * Set the timeout value on the attribute cache and fill it 665 * with the passed in attributes. 666 * 667 * The caller must be holding r_statelock. 668 */ 669 static void 670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout) 671 { 672 rnode4_t *rp; 673 mntinfo4_t *mi; 674 hrtime_t delta; 675 hrtime_t now; 676 vattr_t *vap = &garp->n4g_va; 677 678 rp = VTOR4(vp); 679 680 ASSERT(MUTEX_HELD(&rp->r_statelock)); 681 ASSERT(vap->va_mask == AT_ALL); 682 683 /* Switch to master before checking v_flag */ 684 if (IS_SHADOW(vp, rp)) 685 vp = RTOV4(rp); 686 687 now = gethrtime(); 688 689 mi = VTOMI4(vp); 690 691 /* 692 * Only establish a new cache timeout (if requested). Never 693 * extend a timeout. Never clear a timeout. Clearing a timeout 694 * is done by nfs4_update_dircaches (ancestor in our call chain) 695 */ 696 if (set_cache_timeout && ! rp->r_time_cache_inval) 697 rp->r_time_cache_inval = now + mi->mi_acdirmax; 698 699 /* 700 * Delta is the number of nanoseconds that we will 701 * cache the attributes of the file. It is based on 702 * the number of nanoseconds since the last time that 703 * we detected a change. The assumption is that files 704 * that changed recently are likely to change again. 705 * There is a minimum and a maximum for regular files 706 * and for directories which is enforced though. 707 * 708 * Using the time since last change was detected 709 * eliminates direct comparison or calculation 710 * using mixed client and server times. NFS does 711 * not make any assumptions regarding the client 712 * and server clocks being synchronized. 
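 *
 * For example, with mi_acregmin of (say) 3 seconds and mi_acregmax of
 * 60 seconds, a regular file whose last detected change was 10 seconds
 * ago is cached for 10 seconds, one that changed a moment ago is still
 * cached for the 3 second minimum, and one that has not changed in
 * hours is capped at the 60 second maximum.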
713 */ 714 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 715 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 716 vap->va_size != rp->r_attr.va_size) { 717 rp->r_time_attr_saved = now; 718 } 719 720 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE)) 721 delta = 0; 722 else { 723 delta = now - rp->r_time_attr_saved; 724 if (vp->v_type == VDIR) { 725 if (delta < mi->mi_acdirmin) 726 delta = mi->mi_acdirmin; 727 else if (delta > mi->mi_acdirmax) 728 delta = mi->mi_acdirmax; 729 } else { 730 if (delta < mi->mi_acregmin) 731 delta = mi->mi_acregmin; 732 else if (delta > mi->mi_acregmax) 733 delta = mi->mi_acregmax; 734 } 735 } 736 rp->r_time_attr_inval = now + delta; 737 738 rp->r_attr = *vap; 739 if (garp->n4g_change_valid) 740 rp->r_change = garp->n4g_change; 741 742 /* 743 * The attributes that were returned may be valid and can 744 * be used, but they may not be allowed to be cached. 745 * Reset the timers to cause immediate invalidation and 746 * clear r_change so no VERIFY operations will suceed 747 */ 748 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) { 749 rp->r_time_attr_inval = now; 750 rp->r_time_attr_saved = now; 751 rp->r_change = 0; 752 } 753 754 /* 755 * If mounted_on_fileid returned AND the object is a stub, 756 * then set object's va_nodeid to the mounted over fid 757 * returned by server. 758 * 759 * If mounted_on_fileid not provided/supported, then 760 * just set it to 0 for now. Eventually it would be 761 * better to set it to a hashed version of FH. This 762 * would probably be good enough to provide a unique 763 * fid/d_ino within a dir. 764 * 765 * We don't need to carry mounted_on_fileid in the 766 * rnode as long as the client never requests fileid 767 * without also requesting mounted_on_fileid. For 768 * now, it stays. 769 */ 770 if (garp->n4g_mon_fid_valid) { 771 rp->r_mntd_fid = garp->n4g_mon_fid; 772 773 if (RP_ISSTUB(rp)) 774 rp->r_attr.va_nodeid = rp->r_mntd_fid; 775 } 776 777 /* 778 * Check to see if there are valid pathconf bits to 779 * cache in the rnode. 780 */ 781 if (garp->n4g_ext_res) { 782 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) { 783 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4; 784 } else { 785 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) { 786 rp->r_pathconf.pc4_xattr_valid = TRUE; 787 rp->r_pathconf.pc4_xattr_exists = 788 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists; 789 } 790 } 791 } 792 /* 793 * Update the size of the file if there is no cached data or if 794 * the cached data is clean and there is no data being written 795 * out. 796 */ 797 if (rp->r_size != vap->va_size && 798 (!vn_has_cached_data(vp) || 799 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) { 800 rp->r_size = vap->va_size; 801 } 802 nfs_setswaplike(vp, vap); 803 rp->r_flags &= ~R4WRITEMODIFIED; 804 } 805 806 /* 807 * Get attributes over-the-wire and update attributes cache 808 * if no error occurred in the over-the-wire operation. 809 * Return 0 if successful, otherwise error. 
810 */ 811 int 812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 813 { 814 mntinfo4_t *mi = VTOMI4(vp); 815 hrtime_t t; 816 nfs4_recov_state_t recov_state; 817 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 818 819 recov_state.rs_flags = 0; 820 recov_state.rs_num_retry_despite_err = 0; 821 822 /* Save the original mount point security flavor */ 823 (void) save_mnt_secinfo(mi->mi_curr_serv); 824 825 recov_retry: 826 827 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 828 &recov_state, NULL))) { 829 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 830 return (e.error); 831 } 832 833 t = gethrtime(); 834 835 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 836 837 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 838 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 839 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) { 840 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 841 &recov_state, 1); 842 goto recov_retry; 843 } 844 } 845 846 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 847 848 if (!e.error) { 849 if (e.stat == NFS4_OK) { 850 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 851 } else { 852 e.error = geterrno4(e.stat); 853 854 nfs4_purge_stale_fh(e.error, vp, cr); 855 } 856 } 857 858 /* 859 * If getattr a node that is a stub for a crossed 860 * mount point, keep the original secinfo flavor for 861 * the current file system, not the crossed one. 862 */ 863 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 864 865 return (e.error); 866 } 867 868 /* 869 * Generate a compound to get attributes over-the-wire. 870 */ 871 void 872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 873 nfs4_error_t *ep, cred_t *cr, int get_acl) 874 { 875 COMPOUND4args_clnt args; 876 COMPOUND4res_clnt res; 877 int doqueue; 878 rnode4_t *rp = VTOR4(vp); 879 nfs_argop4 argop[2]; 880 881 args.ctag = TAG_GETATTR; 882 883 args.array_len = 2; 884 args.array = argop; 885 886 /* putfh */ 887 argop[0].argop = OP_CPUTFH; 888 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 889 890 /* getattr */ 891 /* 892 * Unlike nfs version 2 and 3, where getattr returns all the 893 * attributes, nfs version 4 returns only the ones explicitly 894 * asked for. This creates problems, as some system functions 895 * (e.g. cache check) require certain attributes and if the 896 * cached node lacks some attributes such as uid/gid, it can 897 * affect system utilities (e.g. "ls") that rely on the information 898 * to be there. This can lead to anything from system crashes to 899 * corrupted information processed by user apps. 900 * So to ensure that all bases are covered, request at least 901 * the AT_ALL attribute mask. 902 */ 903 argop[1].argop = OP_GETATTR; 904 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 905 if (get_acl) 906 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 907 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 908 909 doqueue = 1; 910 911 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 912 913 if (ep->error) 914 return; 915 916 if (res.status != NFS4_OK) { 917 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 918 return; 919 } 920 921 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 922 923 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 924 } 925 926 /* 927 * Return either cached or remote attributes. If get remote attr 928 * use them to check and invalidate caches, then cache the new attributes. 
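 *
 * (Either way, va_size in the returned attributes is overridden with
 * r_size, the client's view of the file size, which can be larger than
 * the server's while dirty pages have yet to be written back.)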
929 */ 930 int 931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 932 { 933 int error; 934 rnode4_t *rp; 935 nfs4_ga_res_t gar; 936 937 ASSERT(nfs4_consistent_type(vp)); 938 939 /* 940 * If we've got cached attributes, we're done, otherwise go 941 * to the server to get attributes, which will update the cache 942 * in the process. Either way, use the cached attributes for 943 * the caller's vattr_t. 944 * 945 * Note that we ignore the gar set by the OTW call: the attr caching 946 * code may make adjustments when storing to the rnode, and we want 947 * to see those changes here. 948 */ 949 rp = VTOR4(vp); 950 error = 0; 951 mutex_enter(&rp->r_statelock); 952 if (!ATTRCACHE4_VALID(vp)) { 953 mutex_exit(&rp->r_statelock); 954 error = nfs4_getattr_otw(vp, &gar, cr, 0); 955 mutex_enter(&rp->r_statelock); 956 } 957 958 if (!error) 959 *vap = rp->r_attr; 960 961 /* Return the client's view of file size */ 962 vap->va_size = rp->r_size; 963 964 mutex_exit(&rp->r_statelock); 965 966 ASSERT(nfs4_consistent_type(vp)); 967 968 return (error); 969 } 970 971 int 972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 973 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 974 { 975 COMPOUND4args_clnt args; 976 COMPOUND4res_clnt res; 977 int doqueue; 978 nfs_argop4 argop[2]; 979 mntinfo4_t *mi = VTOMI4(vp); 980 bool_t needrecov = FALSE; 981 nfs4_recov_state_t recov_state; 982 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 983 nfs4_ga_ext_res_t *gerp; 984 985 recov_state.rs_flags = 0; 986 recov_state.rs_num_retry_despite_err = 0; 987 988 recov_retry: 989 args.ctag = tag_type; 990 991 args.array_len = 2; 992 args.array = argop; 993 994 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 995 if (e.error) 996 return (e.error); 997 998 /* putfh */ 999 argop[0].argop = OP_CPUTFH; 1000 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 1001 1002 /* getattr */ 1003 argop[1].argop = OP_GETATTR; 1004 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 1005 argop[1].nfs_argop4_u.opgetattr.mi = mi; 1006 1007 doqueue = 1; 1008 1009 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1010 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 1011 rnode4info(VTOR4(vp)))); 1012 1013 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1014 1015 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1016 if (!needrecov && e.error) { 1017 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1018 needrecov); 1019 return (e.error); 1020 } 1021 1022 if (needrecov) { 1023 bool_t abort; 1024 1025 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1026 "nfs4_attr_otw: initiating recovery\n")); 1027 1028 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1029 NULL, OP_GETATTR, NULL, NULL, NULL); 1030 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1031 needrecov); 1032 if (!e.error) { 1033 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1034 e.error = geterrno4(res.status); 1035 } 1036 if (abort == FALSE) 1037 goto recov_retry; 1038 return (e.error); 1039 } 1040 1041 if (res.status) { 1042 e.error = geterrno4(res.status); 1043 } else { 1044 gerp = garp->n4g_ext_res; 1045 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1046 garp, sizeof (nfs4_ga_res_t)); 1047 garp->n4g_ext_res = gerp; 1048 if (garp->n4g_ext_res && 1049 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1050 bcopy(res.array[1].nfs_resop4_u.opgetattr. 
1051 ga_res.n4g_ext_res, 1052 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1053 } 1054 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1055 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1056 needrecov); 1057 return (e.error); 1058 } 1059 1060 /* 1061 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1062 * for the demand-based allocation of async threads per-mount. The 1063 * nfs_async_timeout is the amount of time a thread will live after it 1064 * becomes idle, unless new I/O requests are received before the thread 1065 * dies. See nfs4_async_putpage and nfs4_async_start. 1066 */ 1067 1068 static void nfs4_async_start(struct vfs *); 1069 static void nfs4_async_pgops_start(struct vfs *); 1070 static void nfs4_async_common_start(struct vfs *, int); 1071 1072 static void 1073 free_async_args4(struct nfs4_async_reqs *args) 1074 { 1075 rnode4_t *rp; 1076 1077 if (args->a_io != NFS4_INACTIVE) { 1078 rp = VTOR4(args->a_vp); 1079 mutex_enter(&rp->r_statelock); 1080 rp->r_count--; 1081 if (args->a_io == NFS4_PUTAPAGE || 1082 args->a_io == NFS4_PAGEIO) 1083 rp->r_awcount--; 1084 cv_broadcast(&rp->r_cv); 1085 mutex_exit(&rp->r_statelock); 1086 VN_RELE(args->a_vp); 1087 } 1088 crfree(args->a_cred); 1089 kmem_free(args, sizeof (*args)); 1090 } 1091 1092 /* 1093 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1094 * pageout(), running in the global zone, have legitimate reasons to do 1095 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1096 * use of a a per-mount "asynchronous requests manager thread" which is 1097 * signaled by the various asynchronous work routines when there is 1098 * asynchronous work to be done. It is responsible for creating new 1099 * worker threads if necessary, and notifying existing worker threads 1100 * that there is work to be done. 1101 * 1102 * In other words, it will "take the specifications from the customers and 1103 * give them to the engineers." 1104 * 1105 * Worker threads die off of their own accord if they are no longer 1106 * needed. 1107 * 1108 * This thread is killed when the zone is going away or the filesystem 1109 * is being unmounted. 1110 */ 1111 void 1112 nfs4_async_manager(vfs_t *vfsp) 1113 { 1114 callb_cpr_t cprinfo; 1115 mntinfo4_t *mi; 1116 uint_t max_threads; 1117 1118 mi = VFTOMI4(vfsp); 1119 1120 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1121 "nfs4_async_manager"); 1122 1123 mutex_enter(&mi->mi_async_lock); 1124 /* 1125 * We want to stash the max number of threads that this mount was 1126 * allowed so we can use it later when the variable is set to zero as 1127 * part of the zone/mount going away. 1128 * 1129 * We want to be able to create at least one thread to handle 1130 * asynchronous inactive calls. 1131 */ 1132 max_threads = MAX(mi->mi_max_threads, 1); 1133 /* 1134 * We don't want to wait for mi_max_threads to go to zero, since that 1135 * happens as part of a failed unmount, but this thread should only 1136 * exit when the mount is really going away. 1137 * 1138 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be 1139 * attempted: the various _async_*() functions know to do things 1140 * inline if mi_max_threads == 0. Henceforth we just drain out the 1141 * outstanding requests. 
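 * (The inline fall-back is the noasync: path in each of those functions;
 * nfs4_async_putapage(), for instance, ends up calling the supplied
 * putapage routine directly in the caller's context.)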
1142 * 1143 * Note that we still create zthreads even if we notice the zone is 1144 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone 1145 * shutdown sequence to take slightly longer in some cases, but 1146 * doesn't violate the protocol, as all threads will exit as soon as 1147 * they're done processing the remaining requests. 1148 */ 1149 for (;;) { 1150 while (mi->mi_async_req_count > 0) { 1151 /* 1152 * Paranoia: If the mount started out having 1153 * (mi->mi_max_threads == 0), and the value was 1154 * later changed (via a debugger or somesuch), 1155 * we could be confused since we will think we 1156 * can't create any threads, and the calling 1157 * code (which looks at the current value of 1158 * mi->mi_max_threads, now non-zero) thinks we 1159 * can. 1160 * 1161 * So, because we're paranoid, we create threads 1162 * up to the maximum of the original and the 1163 * current value. This means that future 1164 * (debugger-induced) alterations of 1165 * mi->mi_max_threads are ignored for our 1166 * purposes, but who told them they could change 1167 * random values on a live kernel anyhow? 1168 */ 1169 if (mi->mi_threads[NFS4_ASYNC_QUEUE] < 1170 MAX(mi->mi_max_threads, max_threads)) { 1171 mi->mi_threads[NFS4_ASYNC_QUEUE]++; 1172 mutex_exit(&mi->mi_async_lock); 1173 MI4_HOLD(mi); 1174 VFS_HOLD(vfsp); /* hold for new thread */ 1175 (void) zthread_create(NULL, 0, nfs4_async_start, 1176 vfsp, 0, minclsyspri); 1177 mutex_enter(&mi->mi_async_lock); 1178 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] < 1179 NUM_ASYNC_PGOPS_THREADS) { 1180 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++; 1181 mutex_exit(&mi->mi_async_lock); 1182 MI4_HOLD(mi); 1183 VFS_HOLD(vfsp); /* hold for new thread */ 1184 (void) zthread_create(NULL, 0, 1185 nfs4_async_pgops_start, vfsp, 0, 1186 minclsyspri); 1187 mutex_enter(&mi->mi_async_lock); 1188 } 1189 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1190 ASSERT(mi->mi_async_req_count != 0); 1191 mi->mi_async_req_count--; 1192 } 1193 1194 mutex_enter(&mi->mi_lock); 1195 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) { 1196 mutex_exit(&mi->mi_lock); 1197 break; 1198 } 1199 mutex_exit(&mi->mi_lock); 1200 1201 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1202 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1203 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1204 } 1205 1206 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1207 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp)); 1208 /* 1209 * Let everyone know we're done. 1210 */ 1211 mi->mi_manager_thread = NULL; 1212 /* 1213 * Wake up the inactive thread. 1214 */ 1215 cv_broadcast(&mi->mi_inact_req_cv); 1216 /* 1217 * Wake up anyone sitting in nfs4_async_manager_stop() 1218 */ 1219 cv_broadcast(&mi->mi_async_cv); 1220 /* 1221 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1222 * since CALLB_CPR_EXIT is actually responsible for releasing 1223 * 'mi_async_lock'. 1224 */ 1225 CALLB_CPR_EXIT(&cprinfo); 1226 VFS_RELE(vfsp); /* release thread's hold */ 1227 MI4_RELE(mi); 1228 zthread_exit(); 1229 } 1230 1231 /* 1232 * Signal (and wait for) the async manager thread to clean up and go away. 1233 */ 1234 void 1235 nfs4_async_manager_stop(vfs_t *vfsp) 1236 { 1237 mntinfo4_t *mi = VFTOMI4(vfsp); 1238 1239 mutex_enter(&mi->mi_async_lock); 1240 mutex_enter(&mi->mi_lock); 1241 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1242 mutex_exit(&mi->mi_lock); 1243 cv_broadcast(&mi->mi_async_reqs_cv); 1244 /* 1245 * Wait for the async manager thread to die. 
1246 */ 1247 while (mi->mi_manager_thread != NULL) 1248 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1249 mutex_exit(&mi->mi_async_lock); 1250 } 1251 1252 int 1253 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1254 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1255 u_offset_t, caddr_t, struct seg *, cred_t *)) 1256 { 1257 rnode4_t *rp; 1258 mntinfo4_t *mi; 1259 struct nfs4_async_reqs *args; 1260 1261 rp = VTOR4(vp); 1262 ASSERT(rp->r_freef == NULL); 1263 1264 mi = VTOMI4(vp); 1265 1266 /* 1267 * If addr falls in a different segment, don't bother doing readahead. 1268 */ 1269 if (addr >= seg->s_base + seg->s_size) 1270 return (-1); 1271 1272 /* 1273 * If we can't allocate a request structure, punt on the readahead. 1274 */ 1275 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1276 return (-1); 1277 1278 /* 1279 * If a lock operation is pending, don't initiate any new 1280 * readaheads. Otherwise, bump r_count to indicate the new 1281 * asynchronous I/O. 1282 */ 1283 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1284 kmem_free(args, sizeof (*args)); 1285 return (-1); 1286 } 1287 mutex_enter(&rp->r_statelock); 1288 rp->r_count++; 1289 mutex_exit(&rp->r_statelock); 1290 nfs_rw_exit(&rp->r_lkserlock); 1291 1292 args->a_next = NULL; 1293 #ifdef DEBUG 1294 args->a_queuer = curthread; 1295 #endif 1296 VN_HOLD(vp); 1297 args->a_vp = vp; 1298 ASSERT(cr != NULL); 1299 crhold(cr); 1300 args->a_cred = cr; 1301 args->a_io = NFS4_READ_AHEAD; 1302 args->a_nfs4_readahead = readahead; 1303 args->a_nfs4_blkoff = blkoff; 1304 args->a_nfs4_seg = seg; 1305 args->a_nfs4_addr = addr; 1306 1307 mutex_enter(&mi->mi_async_lock); 1308 1309 /* 1310 * If asyncio has been disabled, don't bother readahead. 1311 */ 1312 if (mi->mi_max_threads == 0) { 1313 mutex_exit(&mi->mi_async_lock); 1314 goto noasync; 1315 } 1316 1317 /* 1318 * Link request structure into the async list and 1319 * wakeup async thread to do the i/o. 1320 */ 1321 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1322 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1323 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1324 } else { 1325 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1326 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1327 } 1328 1329 if (mi->mi_io_kstats) { 1330 mutex_enter(&mi->mi_lock); 1331 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1332 mutex_exit(&mi->mi_lock); 1333 } 1334 1335 mi->mi_async_req_count++; 1336 ASSERT(mi->mi_async_req_count != 0); 1337 cv_signal(&mi->mi_async_reqs_cv); 1338 mutex_exit(&mi->mi_async_lock); 1339 return (0); 1340 1341 noasync: 1342 mutex_enter(&rp->r_statelock); 1343 rp->r_count--; 1344 cv_broadcast(&rp->r_cv); 1345 mutex_exit(&rp->r_statelock); 1346 VN_RELE(vp); 1347 crfree(cr); 1348 kmem_free(args, sizeof (*args)); 1349 return (-1); 1350 } 1351 1352 static void 1353 nfs4_async_start(struct vfs *vfsp) 1354 { 1355 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE); 1356 } 1357 1358 static void 1359 nfs4_async_pgops_start(struct vfs *vfsp) 1360 { 1361 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE); 1362 } 1363 1364 /* 1365 * The async queues for each mounted file system are arranged as a 1366 * set of queues, one for each async i/o type. Requests are taken 1367 * from the queues in a round-robin fashion. A number of consecutive 1368 * requests are taken from each queue before moving on to the next 1369 * queue. 
This functionality may allow the NFS Version 2 server to do 1370 * write clustering, even if the client is mixing writes and reads 1371 * because it will take multiple write requests from the queue 1372 * before processing any of the other async i/o types. 1373 * 1374 * XXX The nfs4_async_common_start thread is unsafe in the light of the present 1375 * model defined by cpr to suspend the system. Specifically over the 1376 * wire calls are cpr-unsafe. The thread should be reevaluated in 1377 * case of future updates to the cpr model. 1378 */ 1379 static void 1380 nfs4_async_common_start(struct vfs *vfsp, int async_queue) 1381 { 1382 struct nfs4_async_reqs *args; 1383 mntinfo4_t *mi = VFTOMI4(vfsp); 1384 clock_t time_left = 1; 1385 callb_cpr_t cprinfo; 1386 int i; 1387 extern int nfs_async_timeout; 1388 int async_types; 1389 kcondvar_t *async_work_cv; 1390 1391 if (async_queue == NFS4_ASYNC_QUEUE) { 1392 async_types = NFS4_ASYNC_TYPES; 1393 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]; 1394 } else { 1395 async_types = NFS4_ASYNC_PGOPS_TYPES; 1396 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]; 1397 } 1398 1399 /* 1400 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1401 * built in an implementation independent manner. 1402 */ 1403 if (nfs_async_timeout == -1) 1404 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1405 1406 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 1407 1408 mutex_enter(&mi->mi_async_lock); 1409 for (;;) { 1410 /* 1411 * Find the next queue containing an entry. We start 1412 * at the current queue pointer and then round robin 1413 * through all of them until we either find a non-empty 1414 * queue or have looked through all of them. 1415 */ 1416 for (i = 0; i < async_types; i++) { 1417 args = *mi->mi_async_curr[async_queue]; 1418 if (args != NULL) 1419 break; 1420 mi->mi_async_curr[async_queue]++; 1421 if (mi->mi_async_curr[async_queue] == 1422 &mi->mi_async_reqs[async_types]) { 1423 mi->mi_async_curr[async_queue] = 1424 &mi->mi_async_reqs[0]; 1425 } 1426 } 1427 /* 1428 * If we didn't find a entry, then block until woken up 1429 * again and then look through the queues again. 1430 */ 1431 if (args == NULL) { 1432 /* 1433 * Exiting is considered to be safe for CPR as well 1434 */ 1435 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1436 1437 /* 1438 * Wakeup thread waiting to unmount the file 1439 * system only if all async threads are inactive. 1440 * 1441 * If we've timed-out and there's nothing to do, 1442 * then get rid of this thread. 1443 */ 1444 if (mi->mi_max_threads == 0 || time_left <= 0) { 1445 --mi->mi_threads[async_queue]; 1446 1447 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 1448 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0) 1449 cv_signal(&mi->mi_async_cv); 1450 CALLB_CPR_EXIT(&cprinfo); 1451 VFS_RELE(vfsp); /* release thread's hold */ 1452 MI4_RELE(mi); 1453 zthread_exit(); 1454 /* NOTREACHED */ 1455 } 1456 time_left = cv_reltimedwait(async_work_cv, 1457 &mi->mi_async_lock, nfs_async_timeout, 1458 TR_CLOCK_TICK); 1459 1460 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1461 1462 continue; 1463 } else { 1464 time_left = 1; 1465 } 1466 1467 /* 1468 * Remove the request from the async queue and then 1469 * update the current async request queue pointer. If 1470 * the current queue is empty or we have removed enough 1471 * consecutive entries from it, then reset the counter 1472 * for this queue and then move the current pointer to 1473 * the next queue. 
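 *
 * (With mi_async_init_clusters of 1 the queues are serviced strictly
 * round-robin; a larger value lets up to that many consecutive requests
 * of a single type be taken before the current pointer advances.)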
1474 */ 1475 *mi->mi_async_curr[async_queue] = args->a_next; 1476 if (*mi->mi_async_curr[async_queue] == NULL || 1477 --mi->mi_async_clusters[args->a_io] == 0) { 1478 mi->mi_async_clusters[args->a_io] = 1479 mi->mi_async_init_clusters; 1480 mi->mi_async_curr[async_queue]++; 1481 if (mi->mi_async_curr[async_queue] == 1482 &mi->mi_async_reqs[async_types]) { 1483 mi->mi_async_curr[async_queue] = 1484 &mi->mi_async_reqs[0]; 1485 } 1486 } 1487 1488 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) { 1489 mutex_enter(&mi->mi_lock); 1490 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 1491 mutex_exit(&mi->mi_lock); 1492 } 1493 1494 mutex_exit(&mi->mi_async_lock); 1495 1496 /* 1497 * Obtain arguments from the async request structure. 1498 */ 1499 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) { 1500 (*args->a_nfs4_readahead)(args->a_vp, 1501 args->a_nfs4_blkoff, args->a_nfs4_addr, 1502 args->a_nfs4_seg, args->a_cred); 1503 } else if (args->a_io == NFS4_PUTAPAGE) { 1504 (void) (*args->a_nfs4_putapage)(args->a_vp, 1505 args->a_nfs4_pp, args->a_nfs4_off, 1506 args->a_nfs4_len, args->a_nfs4_flags, 1507 args->a_cred); 1508 } else if (args->a_io == NFS4_PAGEIO) { 1509 (void) (*args->a_nfs4_pageio)(args->a_vp, 1510 args->a_nfs4_pp, args->a_nfs4_off, 1511 args->a_nfs4_len, args->a_nfs4_flags, 1512 args->a_cred); 1513 } else if (args->a_io == NFS4_READDIR) { 1514 (void) ((*args->a_nfs4_readdir)(args->a_vp, 1515 args->a_nfs4_rdc, args->a_cred)); 1516 } else if (args->a_io == NFS4_COMMIT) { 1517 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist, 1518 args->a_nfs4_offset, args->a_nfs4_count, 1519 args->a_cred); 1520 } else if (args->a_io == NFS4_INACTIVE) { 1521 nfs4_inactive_otw(args->a_vp, args->a_cred); 1522 } 1523 1524 /* 1525 * Now, release the vnode and free the credentials 1526 * structure. 1527 */ 1528 free_async_args4(args); 1529 /* 1530 * Reacquire the mutex because it will be needed above. 1531 */ 1532 mutex_enter(&mi->mi_async_lock); 1533 } 1534 } 1535 1536 /* 1537 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as 1538 * part of VOP_INACTIVE. 1539 */ 1540 1541 void 1542 nfs4_inactive_thread(mntinfo4_t *mi) 1543 { 1544 struct nfs4_async_reqs *args; 1545 callb_cpr_t cprinfo; 1546 vfs_t *vfsp = mi->mi_vfsp; 1547 1548 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1549 "nfs4_inactive_thread"); 1550 1551 for (;;) { 1552 mutex_enter(&mi->mi_async_lock); 1553 args = mi->mi_async_reqs[NFS4_INACTIVE]; 1554 if (args == NULL) { 1555 mutex_enter(&mi->mi_lock); 1556 /* 1557 * We don't want to exit until the async manager is done 1558 * with its work; hence the check for mi_manager_thread 1559 * being NULL. 1560 * 1561 * The async manager thread will cv_broadcast() on 1562 * mi_inact_req_cv when it's done, at which point we'll 1563 * wake up and exit. 
1564 */ 1565 if (mi->mi_manager_thread == NULL) 1566 goto die; 1567 mi->mi_flags |= MI4_INACTIVE_IDLE; 1568 mutex_exit(&mi->mi_lock); 1569 cv_signal(&mi->mi_async_cv); 1570 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1571 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock); 1572 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1573 mutex_exit(&mi->mi_async_lock); 1574 } else { 1575 mutex_enter(&mi->mi_lock); 1576 mi->mi_flags &= ~MI4_INACTIVE_IDLE; 1577 mutex_exit(&mi->mi_lock); 1578 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next; 1579 mutex_exit(&mi->mi_async_lock); 1580 nfs4_inactive_otw(args->a_vp, args->a_cred); 1581 crfree(args->a_cred); 1582 kmem_free(args, sizeof (*args)); 1583 } 1584 } 1585 die: 1586 mutex_exit(&mi->mi_lock); 1587 mi->mi_inactive_thread = NULL; 1588 cv_signal(&mi->mi_async_cv); 1589 1590 /* 1591 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since 1592 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'. 1593 */ 1594 CALLB_CPR_EXIT(&cprinfo); 1595 1596 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1597 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp)); 1598 1599 MI4_RELE(mi); 1600 zthread_exit(); 1601 /* NOTREACHED */ 1602 } 1603 1604 /* 1605 * nfs_async_stop: 1606 * Wait for all outstanding putpage operations and the inactive thread to 1607 * complete; nfs4_async_stop_sig() without interruptibility. 1608 */ 1609 void 1610 nfs4_async_stop(struct vfs *vfsp) 1611 { 1612 mntinfo4_t *mi = VFTOMI4(vfsp); 1613 1614 /* 1615 * Wait for all outstanding async operations to complete and for 1616 * worker threads to exit. 1617 */ 1618 mutex_enter(&mi->mi_async_lock); 1619 mi->mi_max_threads = 0; 1620 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1621 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1622 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) 1623 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1624 1625 /* 1626 * Wait for the inactive thread to finish doing what it's doing. It 1627 * won't exit until the last reference to the vfs_t goes away. 1628 */ 1629 if (mi->mi_inactive_thread != NULL) { 1630 mutex_enter(&mi->mi_lock); 1631 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) || 1632 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) { 1633 mutex_exit(&mi->mi_lock); 1634 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1635 mutex_enter(&mi->mi_lock); 1636 } 1637 mutex_exit(&mi->mi_lock); 1638 } 1639 mutex_exit(&mi->mi_async_lock); 1640 } 1641 1642 /* 1643 * nfs_async_stop_sig: 1644 * Wait for all outstanding putpage operations and the inactive thread to 1645 * complete. If a signal is delivered we will abort and return non-zero; 1646 * otherwise return 0. Since this routine is called from nfs4_unmount, we 1647 * need to make it interruptible. 1648 */ 1649 int 1650 nfs4_async_stop_sig(struct vfs *vfsp) 1651 { 1652 mntinfo4_t *mi = VFTOMI4(vfsp); 1653 ushort_t omax; 1654 bool_t intr = FALSE; 1655 1656 /* 1657 * Wait for all outstanding putpage operations to complete and for 1658 * worker threads to exit. 1659 */ 1660 mutex_enter(&mi->mi_async_lock); 1661 omax = mi->mi_max_threads; 1662 mi->mi_max_threads = 0; 1663 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1664 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1665 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) { 1666 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) { 1667 intr = TRUE; 1668 goto interrupted; 1669 } 1670 } 1671 1672 /* 1673 * Wait for the inactive thread to finish doing what it's doing. It 1674 * won't exit until the a last reference to the vfs_t goes away. 
1675  */
1676     if (mi->mi_inactive_thread != NULL) {
1677         mutex_enter(&mi->mi_lock);
1678         while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679             (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680             mutex_exit(&mi->mi_lock);
1681             if (!cv_wait_sig(&mi->mi_async_cv,
1682                 &mi->mi_async_lock)) {
1683                 intr = TRUE;
1684                 goto interrupted;
1685             }
1686             mutex_enter(&mi->mi_lock);
1687         }
1688         mutex_exit(&mi->mi_lock);
1689     }
1690 interrupted:
1691     if (intr)
1692         mi->mi_max_threads = omax;
1693     mutex_exit(&mi->mi_async_lock);
1694 
1695     return (intr);
1696 }
1697 
1698 int
1699 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701     u_offset_t, size_t, int, cred_t *))
1702 {
1703     rnode4_t *rp;
1704     mntinfo4_t *mi;
1705     struct nfs4_async_reqs *args;
1706 
1707     ASSERT(flags & B_ASYNC);
1708     ASSERT(vp->v_vfsp != NULL);
1709 
1710     rp = VTOR4(vp);
1711     ASSERT(rp->r_count > 0);
1712 
1713     mi = VTOMI4(vp);
1714 
1715     /*
1716      * If we can't allocate a request structure, do the putpage
1717      * operation synchronously in this thread's context.
1718      */
1719     if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720         goto noasync;
1721 
1722     args->a_next = NULL;
1723 #ifdef DEBUG
1724     args->a_queuer = curthread;
1725 #endif
1726     VN_HOLD(vp);
1727     args->a_vp = vp;
1728     ASSERT(cr != NULL);
1729     crhold(cr);
1730     args->a_cred = cr;
1731     args->a_io = NFS4_PUTAPAGE;
1732     args->a_nfs4_putapage = putapage;
1733     args->a_nfs4_pp = pp;
1734     args->a_nfs4_off = off;
1735     args->a_nfs4_len = (uint_t)len;
1736     args->a_nfs4_flags = flags;
1737 
1738     mutex_enter(&mi->mi_async_lock);
1739 
1740     /*
1741      * If asyncio has been disabled, then make a synchronous request.
1742      * This check is done a second time in case async io was disabled
1743      * while this thread was blocked waiting for memory pressure to
1744      * reduce or for the queue to drain.
1745      */
1746     if (mi->mi_max_threads == 0) {
1747         mutex_exit(&mi->mi_async_lock);
1748 
1749         VN_RELE(vp);
1750         crfree(cr);
1751         kmem_free(args, sizeof (*args));
1752         goto noasync;
1753     }
1754 
1755     /*
1756      * Link request structure into the async list and
1757      * wakeup async thread to do the i/o.
1758      */
1759     if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760         mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761         mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762     } else {
1763         mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764         mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765     }
1766 
1767     mutex_enter(&rp->r_statelock);
1768     rp->r_count++;
1769     rp->r_awcount++;
1770     mutex_exit(&rp->r_statelock);
1771 
1772     if (mi->mi_io_kstats) {
1773         mutex_enter(&mi->mi_lock);
1774         kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775         mutex_exit(&mi->mi_lock);
1776     }
1777 
1778     mi->mi_async_req_count++;
1779     ASSERT(mi->mi_async_req_count != 0);
1780     cv_signal(&mi->mi_async_reqs_cv);
1781     mutex_exit(&mi->mi_async_lock);
1782     return (0);
1783 
1784 noasync:
1785 
1786     if (curproc == proc_pageout || curproc == proc_fsflush) {
1787         /*
1788          * If we get here in the context of the pageout/fsflush,
1789          * we refuse to do a sync write, because this may hang
1790          * pageout/fsflush and the machine. In this case, we just
1791          * re-mark the page as dirty and punt on the page.
1792          *
1793          * Make sure B_FORCE isn't set. We can re-mark the
1794          * pages as dirty and unlock the pages in one swoop by
1795          * passing in B_ERROR to pvn_write_done(). However,
1796          * we should make sure B_FORCE isn't set - we don't
1797          * want the page tossed before it gets written out.
1798          */
1799         if (flags & B_FORCE)
1800             flags &= ~(B_INVAL | B_FORCE);
1801         pvn_write_done(pp, flags | B_ERROR);
1802         return (0);
1803     }
1804 
1805     if (nfs_zone() != mi->mi_zone) {
1806         /*
1807          * So this was a cross-zone sync putpage. We pass in
1808          * B_ERROR to pvn_write_done() to re-mark the pages as
1809          * dirty and unlock them.
1810          *
1811          * We don't want to clear B_FORCE here as the caller
1812          * presumably knows what they're doing if they set it.
1813          */
1814         pvn_write_done(pp, flags | B_ERROR);
1815         return (EPERM);
1816     }
1817     /* Do the putpage operation synchronously in this thread's context. */
1818     return ((*putapage)(vp, pp, off, len, flags, cr));
1819 }
1820 
1821 int
1822 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1823     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1824     size_t, int, cred_t *))
1825 {
1826     rnode4_t *rp;
1827     mntinfo4_t *mi;
1828     struct nfs4_async_reqs *args;
1829 
1830     ASSERT(flags & B_ASYNC);
1831     ASSERT(vp->v_vfsp != NULL);
1832 
1833     rp = VTOR4(vp);
1834     ASSERT(rp->r_count > 0);
1835 
1836     mi = VTOMI4(vp);
1837 
1838     /*
1839      * If we can't allocate a request structure, do the pageio
1840      * request synchronously in this thread's context.
1841      */
1842     if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1843         goto noasync;
1844 
1845     args->a_next = NULL;
1846 #ifdef DEBUG
1847     args->a_queuer = curthread;
1848 #endif
1849     VN_HOLD(vp);
1850     args->a_vp = vp;
1851     ASSERT(cr != NULL);
1852     crhold(cr);
1853     args->a_cred = cr;
1854     args->a_io = NFS4_PAGEIO;
1855     args->a_nfs4_pageio = pageio;
1856     args->a_nfs4_pp = pp;
1857     args->a_nfs4_off = io_off;
1858     args->a_nfs4_len = (uint_t)io_len;
1859     args->a_nfs4_flags = flags;
1860 
1861     mutex_enter(&mi->mi_async_lock);
1862 
1863     /*
1864      * If asyncio has been disabled, then make a synchronous request.
1865      * This check is done a second time in case async io was disabled
1866      * while this thread was blocked waiting for memory pressure to
1867      * reduce or for the queue to drain.
1868      */
1869     if (mi->mi_max_threads == 0) {
1870         mutex_exit(&mi->mi_async_lock);
1871 
1872         VN_RELE(vp);
1873         crfree(cr);
1874         kmem_free(args, sizeof (*args));
1875         goto noasync;
1876     }
1877 
1878     /*
1879      * Link request structure into the async list and
1880      * wakeup async thread to do the i/o.
1881      */
1882     if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1883         mi->mi_async_reqs[NFS4_PAGEIO] = args;
1884         mi->mi_async_tail[NFS4_PAGEIO] = args;
1885     } else {
1886         mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1887         mi->mi_async_tail[NFS4_PAGEIO] = args;
1888     }
1889 
1890     mutex_enter(&rp->r_statelock);
1891     rp->r_count++;
1892     rp->r_awcount++;
1893     mutex_exit(&rp->r_statelock);
1894 
1895     if (mi->mi_io_kstats) {
1896         mutex_enter(&mi->mi_lock);
1897         kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1898         mutex_exit(&mi->mi_lock);
1899     }
1900 
1901     mi->mi_async_req_count++;
1902     ASSERT(mi->mi_async_req_count != 0);
1903     cv_signal(&mi->mi_async_reqs_cv);
1904     mutex_exit(&mi->mi_async_lock);
1905     return (0);
1906 
1907 noasync:
1908     /*
1909      * If we can't do it ASYNC, for reads we do nothing (but clean up
1910      * the page list), for writes we do it synchronously, except for
1911      * proc_pageout/proc_fsflush as described below.
1912 */ 1913 if (flags & B_READ) { 1914 pvn_read_done(pp, flags | B_ERROR); 1915 return (0); 1916 } 1917 1918 if (curproc == proc_pageout || curproc == proc_fsflush) { 1919 /* 1920 * If we get here in the context of the pageout/fsflush, 1921 * we refuse to do a sync write, because this may hang 1922 * pageout/fsflush (and the machine). In this case, we just 1923 * re-mark the page as dirty and punt on the page. 1924 * 1925 * Make sure B_FORCE isn't set. We can re-mark the 1926 * pages as dirty and unlock the pages in one swoop by 1927 * passing in B_ERROR to pvn_write_done(). However, 1928 * we should make sure B_FORCE isn't set - we don't 1929 * want the page tossed before it gets written out. 1930 */ 1931 if (flags & B_FORCE) 1932 flags &= ~(B_INVAL | B_FORCE); 1933 pvn_write_done(pp, flags | B_ERROR); 1934 return (0); 1935 } 1936 1937 if (nfs_zone() != mi->mi_zone) { 1938 /* 1939 * So this was a cross-zone sync pageio. We pass in B_ERROR 1940 * to pvn_write_done() to re-mark the pages as dirty and unlock 1941 * them. 1942 * 1943 * We don't want to clear B_FORCE here as the caller presumably 1944 * knows what they're doing if they set it. 1945 */ 1946 pvn_write_done(pp, flags | B_ERROR); 1947 return (EPERM); 1948 } 1949 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1950 } 1951 1952 void 1953 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr, 1954 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *)) 1955 { 1956 rnode4_t *rp; 1957 mntinfo4_t *mi; 1958 struct nfs4_async_reqs *args; 1959 1960 rp = VTOR4(vp); 1961 ASSERT(rp->r_freef == NULL); 1962 1963 mi = VTOMI4(vp); 1964 1965 /* 1966 * If we can't allocate a request structure, skip the readdir. 1967 */ 1968 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1969 goto noasync; 1970 1971 args->a_next = NULL; 1972 #ifdef DEBUG 1973 args->a_queuer = curthread; 1974 #endif 1975 VN_HOLD(vp); 1976 args->a_vp = vp; 1977 ASSERT(cr != NULL); 1978 crhold(cr); 1979 args->a_cred = cr; 1980 args->a_io = NFS4_READDIR; 1981 args->a_nfs4_readdir = readdir; 1982 args->a_nfs4_rdc = rdc; 1983 1984 mutex_enter(&mi->mi_async_lock); 1985 1986 /* 1987 * If asyncio has been disabled, then skip this request 1988 */ 1989 if (mi->mi_max_threads == 0) { 1990 mutex_exit(&mi->mi_async_lock); 1991 1992 VN_RELE(vp); 1993 crfree(cr); 1994 kmem_free(args, sizeof (*args)); 1995 goto noasync; 1996 } 1997 1998 /* 1999 * Link request structure into the async list and 2000 * wakeup async thread to do the i/o. 2001 */ 2002 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) { 2003 mi->mi_async_reqs[NFS4_READDIR] = args; 2004 mi->mi_async_tail[NFS4_READDIR] = args; 2005 } else { 2006 mi->mi_async_tail[NFS4_READDIR]->a_next = args; 2007 mi->mi_async_tail[NFS4_READDIR] = args; 2008 } 2009 2010 mutex_enter(&rp->r_statelock); 2011 rp->r_count++; 2012 mutex_exit(&rp->r_statelock); 2013 2014 if (mi->mi_io_kstats) { 2015 mutex_enter(&mi->mi_lock); 2016 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2017 mutex_exit(&mi->mi_lock); 2018 } 2019 2020 mi->mi_async_req_count++; 2021 ASSERT(mi->mi_async_req_count != 0); 2022 cv_signal(&mi->mi_async_reqs_cv); 2023 mutex_exit(&mi->mi_async_lock); 2024 return; 2025 2026 noasync: 2027 mutex_enter(&rp->r_statelock); 2028 rdc->entries = NULL; 2029 /* 2030 * Indicate that no one is trying to fill this entry and 2031 * it still needs to be filled. 
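*
* (A hedged reading of the flag handling below: clearing RDDIR records that
* no thread is filling this cache entry any longer, while RDDIRREQ marks it
* as still needing a fill, so a later VOP_READDIR should find the entry
* unfilled and issue the over-the-wire request itself.)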
2032 */ 2033 rdc->flags &= ~RDDIR; 2034 rdc->flags |= RDDIRREQ; 2035 rddir4_cache_rele(rp, rdc); 2036 mutex_exit(&rp->r_statelock); 2037 } 2038 2039 void 2040 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 2041 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 2042 cred_t *)) 2043 { 2044 rnode4_t *rp; 2045 mntinfo4_t *mi; 2046 struct nfs4_async_reqs *args; 2047 page_t *pp; 2048 2049 rp = VTOR4(vp); 2050 mi = VTOMI4(vp); 2051 2052 /* 2053 * If we can't allocate a request structure, do the commit 2054 * operation synchronously in this thread's context. 2055 */ 2056 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2057 goto noasync; 2058 2059 args->a_next = NULL; 2060 #ifdef DEBUG 2061 args->a_queuer = curthread; 2062 #endif 2063 VN_HOLD(vp); 2064 args->a_vp = vp; 2065 ASSERT(cr != NULL); 2066 crhold(cr); 2067 args->a_cred = cr; 2068 args->a_io = NFS4_COMMIT; 2069 args->a_nfs4_commit = commit; 2070 args->a_nfs4_plist = plist; 2071 args->a_nfs4_offset = offset; 2072 args->a_nfs4_count = count; 2073 2074 mutex_enter(&mi->mi_async_lock); 2075 2076 /* 2077 * If asyncio has been disabled, then make a synchronous request. 2078 * This check is done a second time in case async io was diabled 2079 * while this thread was blocked waiting for memory pressure to 2080 * reduce or for the queue to drain. 2081 */ 2082 if (mi->mi_max_threads == 0) { 2083 mutex_exit(&mi->mi_async_lock); 2084 2085 VN_RELE(vp); 2086 crfree(cr); 2087 kmem_free(args, sizeof (*args)); 2088 goto noasync; 2089 } 2090 2091 /* 2092 * Link request structure into the async list and 2093 * wakeup async thread to do the i/o. 2094 */ 2095 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2096 mi->mi_async_reqs[NFS4_COMMIT] = args; 2097 mi->mi_async_tail[NFS4_COMMIT] = args; 2098 } else { 2099 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2100 mi->mi_async_tail[NFS4_COMMIT] = args; 2101 } 2102 2103 mutex_enter(&rp->r_statelock); 2104 rp->r_count++; 2105 mutex_exit(&rp->r_statelock); 2106 2107 if (mi->mi_io_kstats) { 2108 mutex_enter(&mi->mi_lock); 2109 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2110 mutex_exit(&mi->mi_lock); 2111 } 2112 2113 mi->mi_async_req_count++; 2114 ASSERT(mi->mi_async_req_count != 0); 2115 cv_signal(&mi->mi_async_reqs_cv); 2116 mutex_exit(&mi->mi_async_lock); 2117 return; 2118 2119 noasync: 2120 if (curproc == proc_pageout || curproc == proc_fsflush || 2121 nfs_zone() != mi->mi_zone) { 2122 while (plist != NULL) { 2123 pp = plist; 2124 page_sub(&plist, pp); 2125 pp->p_fsdata = C_COMMIT; 2126 page_unlock(pp); 2127 } 2128 return; 2129 } 2130 (*commit)(vp, plist, offset, count, cr); 2131 } 2132 2133 /* 2134 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2135 * reference to the vnode is handed over to the thread; the caller should 2136 * no longer refer to the vnode. 2137 * 2138 * Unlike most of the async routines, this handoff is needed for 2139 * correctness reasons, not just performance. So doing operations in the 2140 * context of the current thread is not an option. 
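*
* (One plausible reading of the correctness requirement, offered as an
* interpretive note: inactive processing may need over-the-wire work such
* as close or delegation return, and the thread dropping the last hold may
* be running in a context where blocking on that is unsafe, so the request
* has to be queued for the dedicated inactive thread instead.)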
2141 */ 2142 void 2143 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2144 { 2145 mntinfo4_t *mi; 2146 struct nfs4_async_reqs *args; 2147 boolean_t signal_inactive_thread = B_FALSE; 2148 2149 mi = VTOMI4(vp); 2150 2151 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2152 args->a_next = NULL; 2153 #ifdef DEBUG 2154 args->a_queuer = curthread; 2155 #endif 2156 args->a_vp = vp; 2157 ASSERT(cr != NULL); 2158 crhold(cr); 2159 args->a_cred = cr; 2160 args->a_io = NFS4_INACTIVE; 2161 2162 /* 2163 * Note that we don't check mi->mi_max_threads here, since we 2164 * *need* to get rid of this vnode regardless of whether someone 2165 * set nfs4_max_threads to zero in /etc/system. 2166 * 2167 * The manager thread knows about this and is willing to create 2168 * at least one thread to accommodate us. 2169 */ 2170 mutex_enter(&mi->mi_async_lock); 2171 if (mi->mi_inactive_thread == NULL) { 2172 rnode4_t *rp; 2173 vnode_t *unldvp = NULL; 2174 char *unlname; 2175 cred_t *unlcred; 2176 2177 mutex_exit(&mi->mi_async_lock); 2178 /* 2179 * We just need to free up the memory associated with the 2180 * vnode, which can be safely done from within the current 2181 * context. 2182 */ 2183 crfree(cr); /* drop our reference */ 2184 kmem_free(args, sizeof (*args)); 2185 rp = VTOR4(vp); 2186 mutex_enter(&rp->r_statelock); 2187 if (rp->r_unldvp != NULL) { 2188 unldvp = rp->r_unldvp; 2189 rp->r_unldvp = NULL; 2190 unlname = rp->r_unlname; 2191 rp->r_unlname = NULL; 2192 unlcred = rp->r_unlcred; 2193 rp->r_unlcred = NULL; 2194 } 2195 mutex_exit(&rp->r_statelock); 2196 /* 2197 * No need to explicitly throw away any cached pages. The 2198 * eventual r4inactive() will attempt a synchronous 2199 * VOP_PUTPAGE() which will immediately fail since the request 2200 * is coming from the wrong zone, and then will proceed to call 2201 * nfs4_invalidate_pages() which will clean things up for us. 2202 * 2203 * Throw away the delegation here so rp4_addfree()'s attempt to 2204 * return any existing delegations becomes a no-op. 2205 */ 2206 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2207 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2208 FALSE); 2209 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2210 nfs_rw_exit(&mi->mi_recovlock); 2211 } 2212 nfs4_clear_open_streams(rp); 2213 2214 rp4_addfree(rp, cr); 2215 if (unldvp != NULL) { 2216 kmem_free(unlname, MAXNAMELEN); 2217 VN_RELE(unldvp); 2218 crfree(unlcred); 2219 } 2220 return; 2221 } 2222 2223 if (mi->mi_manager_thread == NULL) { 2224 /* 2225 * We want to talk to the inactive thread. 2226 */ 2227 signal_inactive_thread = B_TRUE; 2228 } 2229 2230 /* 2231 * Enqueue the vnode and wake up either the special thread (empty 2232 * list) or an async thread. 
2233 */ 2234 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2235 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2236 mi->mi_async_tail[NFS4_INACTIVE] = args; 2237 signal_inactive_thread = B_TRUE; 2238 } else { 2239 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2240 mi->mi_async_tail[NFS4_INACTIVE] = args; 2241 } 2242 if (signal_inactive_thread) { 2243 cv_signal(&mi->mi_inact_req_cv); 2244 } else { 2245 mi->mi_async_req_count++; 2246 ASSERT(mi->mi_async_req_count != 0); 2247 cv_signal(&mi->mi_async_reqs_cv); 2248 } 2249 2250 mutex_exit(&mi->mi_async_lock); 2251 } 2252 2253 int 2254 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2255 { 2256 int pagecreate; 2257 int n; 2258 int saved_n; 2259 caddr_t saved_base; 2260 u_offset_t offset; 2261 int error; 2262 int sm_error; 2263 vnode_t *vp = RTOV(rp); 2264 2265 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2266 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2267 if (!vpm_enable) { 2268 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2269 } 2270 2271 /* 2272 * Move bytes in at most PAGESIZE chunks. We must avoid 2273 * spanning pages in uiomove() because page faults may cause 2274 * the cache to be invalidated out from under us. The r_size is not 2275 * updated until after the uiomove. If we push the last page of a 2276 * file before r_size is correct, we will lose the data written past 2277 * the current (and invalid) r_size. 2278 */ 2279 do { 2280 offset = uio->uio_loffset; 2281 pagecreate = 0; 2282 2283 /* 2284 * n is the number of bytes required to satisfy the request 2285 * or the number of bytes to fill out the page. 2286 */ 2287 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2288 2289 /* 2290 * Check to see if we can skip reading in the page 2291 * and just allocate the memory. We can do this 2292 * if we are going to rewrite the entire mapping 2293 * or if we are going to write to or beyond the current 2294 * end of file from the beginning of the mapping. 2295 * 2296 * The read of r_size is now protected by r_statelock. 2297 */ 2298 mutex_enter(&rp->r_statelock); 2299 /* 2300 * When pgcreated is nonzero the caller has already done 2301 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2302 * segkpm this means we already have at least one page 2303 * created and mapped at base. 2304 */ 2305 pagecreate = pgcreated || 2306 ((offset & PAGEOFFSET) == 0 && 2307 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2308 2309 mutex_exit(&rp->r_statelock); 2310 2311 if (!vpm_enable && pagecreate) { 2312 /* 2313 * The last argument tells segmap_pagecreate() to 2314 * always lock the page, as opposed to sometimes 2315 * returning with the page locked. This way we avoid a 2316 * fault on the ensuing uiomove(), but also 2317 * more importantly (to fix bug 1094402) we can 2318 * call segmap_fault() to unlock the page in all 2319 * cases. An alternative would be to modify 2320 * segmap_pagecreate() to tell us when it is 2321 * locking a page, but that's a fairly major 2322 * interface change. 2323 */ 2324 if (pgcreated == 0) 2325 (void) segmap_pagecreate(segkmap, base, 2326 (uint_t)n, 1); 2327 saved_base = base; 2328 saved_n = n; 2329 } 2330 2331 /* 2332 * The number of bytes of data in the last page cannot 2333 * be accurately determined while the page is being 2334 * uiomove'd to and the size of the file is being updated. 2335 * Thus, inform threads which need to know accurately 2336 * how much data is in the last page of the file.
They 2337 * will not do the i/o immediately, but will arrange for 2338 * the i/o to happen later when this modify operation 2339 * will have finished. 2340 */ 2341 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2342 mutex_enter(&rp->r_statelock); 2343 rp->r_flags |= R4MODINPROGRESS; 2344 rp->r_modaddr = (offset & MAXBMASK); 2345 mutex_exit(&rp->r_statelock); 2346 2347 if (vpm_enable) { 2348 /* 2349 * Copy data. If new pages are created, part of 2350 * the page that is not written will be initialized 2351 * with zeros. 2352 */ 2353 error = vpm_data_copy(vp, offset, n, uio, 2354 !pagecreate, NULL, 0, S_WRITE); 2355 } else { 2356 error = uiomove(base, n, UIO_WRITE, uio); 2357 } 2358 2359 /* 2360 * r_size is the maximum number of 2361 * bytes known to be in the file. 2362 * Make sure it is at least as high as the 2363 * first unwritten byte pointed to by uio_loffset. 2364 */ 2365 mutex_enter(&rp->r_statelock); 2366 if (rp->r_size < uio->uio_loffset) 2367 rp->r_size = uio->uio_loffset; 2368 rp->r_flags &= ~R4MODINPROGRESS; 2369 rp->r_flags |= R4DIRTY; 2370 mutex_exit(&rp->r_statelock); 2371 2372 /* n = # of bytes written */ 2373 n = (int)(uio->uio_loffset - offset); 2374 2375 if (!vpm_enable) { 2376 base += n; 2377 } 2378 2379 tcount -= n; 2380 /* 2381 * If we created pages w/o initializing them completely, 2382 * we need to zero the part that wasn't set up. 2383 * This happens on most EOF write cases and if 2384 * we had some sort of error during the uiomove. 2385 */ 2386 if (!vpm_enable && pagecreate) { 2387 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2388 (void) kzero(base, PAGESIZE - n); 2389 2390 if (pgcreated) { 2391 /* 2392 * Caller is responsible for this page, 2393 * it was not created in this loop. 2394 */ 2395 pgcreated = 0; 2396 } else { 2397 /* 2398 * For bug 1094402: segmap_pagecreate locks 2399 * page. Unlock it. This also unlocks the 2400 * pages allocated by page_create_va() in 2401 * segmap_pagecreate(). 2402 */ 2403 sm_error = segmap_fault(kas.a_hat, segkmap, 2404 saved_base, saved_n, 2405 F_SOFTUNLOCK, S_WRITE); 2406 if (error == 0) 2407 error = sm_error; 2408 } 2409 } 2410 } while (tcount > 0 && error == 0); 2411 2412 return (error); 2413 } 2414 2415 int 2416 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2417 { 2418 rnode4_t *rp; 2419 page_t *pp; 2420 u_offset_t eoff; 2421 u_offset_t io_off; 2422 size_t io_len; 2423 int error; 2424 int rdirty; 2425 int err; 2426 2427 rp = VTOR4(vp); 2428 ASSERT(rp->r_count > 0); 2429 2430 if (!nfs4_has_pages(vp)) 2431 return (0); 2432 2433 ASSERT(vp->v_type != VCHR); 2434 2435 /* 2436 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2437 * writes. B_FORCE is set to force the VM system to actually 2438 * invalidate the pages, even if the i/o failed. The pages 2439 * need to get invalidated because they can't be written out 2440 * because there isn't any space left on either the server's 2441 * file system or in the user's disk quota. The B_FREE bit 2442 * is cleared to avoid confusion as to whether this is a 2443 * request to place the page on the freelist or to destroy 2444 * it. 2445 */ 2446 if ((rp->r_flags & R4OUTOFSPACE) || 2447 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2448 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2449 2450 if (len == 0) { 2451 /* 2452 * If doing a full file synchronous operation, then clear 2453 * the R4DIRTY bit. If a page gets dirtied while the flush 2454 * is happening, then R4DIRTY will get set again.
The 2455 * R4DIRTY bit must get cleared before the flush so that 2456 * we don't lose this information. 2457 * 2458 * If there are no full file async write operations 2459 * pending and RDIRTY bit is set, clear it. 2460 */ 2461 if (off == (u_offset_t)0 && 2462 !(flags & B_ASYNC) && 2463 (rp->r_flags & R4DIRTY)) { 2464 mutex_enter(&rp->r_statelock); 2465 rdirty = (rp->r_flags & R4DIRTY); 2466 rp->r_flags &= ~R4DIRTY; 2467 mutex_exit(&rp->r_statelock); 2468 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2469 mutex_enter(&rp->r_statelock); 2470 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2471 rdirty = (rp->r_flags & R4DIRTY); 2472 rp->r_flags &= ~R4DIRTY; 2473 } 2474 mutex_exit(&rp->r_statelock); 2475 } else 2476 rdirty = 0; 2477 2478 /* 2479 * Search the entire vp list for pages >= off, and flush 2480 * the dirty pages. 2481 */ 2482 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2483 flags, cr); 2484 2485 /* 2486 * If an error occurred and the file was marked as dirty 2487 * before and we aren't forcibly invalidating pages, then 2488 * reset the R4DIRTY flag. 2489 */ 2490 if (error && rdirty && 2491 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2492 mutex_enter(&rp->r_statelock); 2493 rp->r_flags |= R4DIRTY; 2494 mutex_exit(&rp->r_statelock); 2495 } 2496 } else { 2497 /* 2498 * Do a range from [off...off + len) looking for pages 2499 * to deal with. 2500 */ 2501 error = 0; 2502 io_len = 0; 2503 eoff = off + len; 2504 mutex_enter(&rp->r_statelock); 2505 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2506 io_off += io_len) { 2507 mutex_exit(&rp->r_statelock); 2508 /* 2509 * If we are not invalidating, synchronously 2510 * freeing or writing pages use the routine 2511 * page_lookup_nowait() to prevent reclaiming 2512 * them from the free list. 2513 */ 2514 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2515 pp = page_lookup(vp, io_off, 2516 (flags & (B_INVAL | B_FREE)) ? 2517 SE_EXCL : SE_SHARED); 2518 } else { 2519 pp = page_lookup_nowait(vp, io_off, 2520 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2521 } 2522 2523 if (pp == NULL || !pvn_getdirty(pp, flags)) 2524 io_len = PAGESIZE; 2525 else { 2526 err = (*rp->r_putapage)(vp, pp, &io_off, 2527 &io_len, flags, cr); 2528 if (!error) 2529 error = err; 2530 /* 2531 * "io_off" and "io_len" are returned as 2532 * the range of pages we actually wrote. 2533 * This allows us to skip ahead more quickly 2534 * since several pages may've been dealt 2535 * with by this iteration of the loop. 
2536 */ 2537 } 2538 mutex_enter(&rp->r_statelock); 2539 } 2540 mutex_exit(&rp->r_statelock); 2541 } 2542 2543 return (error); 2544 } 2545 2546 void 2547 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2548 { 2549 rnode4_t *rp; 2550 2551 rp = VTOR4(vp); 2552 if (IS_SHADOW(vp, rp)) 2553 vp = RTOV4(rp); 2554 mutex_enter(&rp->r_statelock); 2555 while (rp->r_flags & R4TRUNCATE) 2556 cv_wait(&rp->r_cv, &rp->r_statelock); 2557 rp->r_flags |= R4TRUNCATE; 2558 if (off == (u_offset_t)0) { 2559 rp->r_flags &= ~R4DIRTY; 2560 if (!(rp->r_flags & R4STALE)) 2561 rp->r_error = 0; 2562 } 2563 rp->r_truncaddr = off; 2564 mutex_exit(&rp->r_statelock); 2565 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2566 B_INVAL | B_TRUNC, cr); 2567 mutex_enter(&rp->r_statelock); 2568 rp->r_flags &= ~R4TRUNCATE; 2569 cv_broadcast(&rp->r_cv); 2570 mutex_exit(&rp->r_statelock); 2571 } 2572 2573 static int 2574 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2575 { 2576 mntinfo4_t *mi; 2577 struct mntinfo_kstat *mik; 2578 vfs_t *vfsp; 2579 2580 /* this is a read-only kstat. Bail out on a write */ 2581 if (rw == KSTAT_WRITE) 2582 return (EACCES); 2583 2584 2585 /* 2586 * We don't want to wait here as kstat_chain_lock could be held by 2587 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2588 * and thus could lead to a deadlock. 2589 */ 2590 vfsp = (struct vfs *)ksp->ks_private; 2591 2592 mi = VFTOMI4(vfsp); 2593 mik = (struct mntinfo_kstat *)ksp->ks_data; 2594 2595 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2596 2597 mik->mik_vers = (uint32_t)mi->mi_vers; 2598 mik->mik_flags = mi->mi_flags; 2599 /* 2600 * The sv_secdata holds the flavor the client specifies. 2601 * If the client uses default and a security negotiation 2602 * occurs, sv_currsec will point to the current flavor 2603 * selected from the server flavor list. 2604 * sv_currsec is NULL if no security negotiation takes place. 2605 */ 2606 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2607 mi->mi_curr_serv->sv_currsec->secmod : 2608 mi->mi_curr_serv->sv_secdata->secmod; 2609 mik->mik_curread = (uint32_t)mi->mi_curread; 2610 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2611 mik->mik_retrans = mi->mi_retrans; 2612 mik->mik_timeo = mi->mi_timeo; 2613 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2614 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2615 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2616 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2617 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2618 mik->mik_failover = (uint32_t)mi->mi_failover; 2619 mik->mik_remap = (uint32_t)mi->mi_remap; 2620 2621 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2622 2623 return (0); 2624 } 2625 2626 void 2627 nfs4_mnt_kstat_init(struct vfs *vfsp) 2628 { 2629 mntinfo4_t *mi = VFTOMI4(vfsp); 2630 2631 /* 2632 * PSARC 2001/697 Contract Private Interface 2633 * All nfs kstats are under SunMC contract 2634 * Please refer to the PSARC listed above and contact 2635 * SunMC before making any changes! 
2636 * 2637 * Changes must be reviewed by Solaris File Sharing 2638 * Changes must be communicated to contract-2001-697@sun.com 2639 * 2640 */ 2641 2642 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2643 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2644 if (mi->mi_io_kstats) { 2645 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2646 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2647 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2648 kstat_install(mi->mi_io_kstats); 2649 } 2650 2651 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2652 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2653 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2654 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2655 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2656 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2657 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2658 kstat_install(mi->mi_ro_kstats); 2659 } 2660 2661 nfs4_mnt_recov_kstat_init(vfsp); 2662 } 2663 2664 void 2665 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2666 { 2667 mntinfo4_t *mi; 2668 clock_t now = ddi_get_lbolt(); 2669 2670 mi = VTOMI4(vp); 2671 /* 2672 * In case of forced unmount, do not print any messages 2673 * since it can flood the console with error messages. 2674 */ 2675 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2676 return; 2677 2678 /* 2679 * If the mount point is dead, not recoverable, do not 2680 * print error messages that can flood the console. 2681 */ 2682 if (mi->mi_flags & MI4_RECOV_FAIL) 2683 return; 2684 2685 /* 2686 * No use in flooding the console with ENOSPC 2687 * messages from the same file system. 2688 */ 2689 if ((error != ENOSPC && error != EDQUOT) || 2690 now - mi->mi_printftime > 0) { 2691 zoneid_t zoneid = mi->mi_zone->zone_id; 2692 2693 #ifdef DEBUG 2694 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2695 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2696 #else 2697 nfs_perror(error, "NFS write error on host %s: %m.\n", 2698 VTOR4(vp)->r_server->sv_hostname, NULL); 2699 #endif 2700 if (error == ENOSPC || error == EDQUOT) { 2701 zcmn_err(zoneid, CE_CONT, 2702 "^File: userid=%d, groupid=%d\n", 2703 crgetuid(cr), crgetgid(cr)); 2704 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2705 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2706 zcmn_err(zoneid, CE_CONT, 2707 "^User: userid=%d, groupid=%d\n", 2708 crgetuid(curthread->t_cred), 2709 crgetgid(curthread->t_cred)); 2710 } 2711 mi->mi_printftime = now + 2712 nfs_write_error_interval * hz; 2713 } 2714 sfh4_printfhandle(VTOR4(vp)->r_fh); 2715 #ifdef DEBUG 2716 if (error == EACCES) { 2717 zcmn_err(zoneid, CE_CONT, 2718 "nfs_bio: cred is%s kcred\n", 2719 cr == kcred ? "" : " not"); 2720 } 2721 #endif 2722 } 2723 } 2724 2725 /* 2726 * Return non-zero if the given file can be safely memory mapped. Locks 2727 * are safe if whole-file (length and offset are both zero). 2728 */ 2729 2730 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2731 2732 static int 2733 nfs4_safemap(const vnode_t *vp) 2734 { 2735 locklist_t *llp, *next_llp; 2736 int safe = 1; 2737 rnode4_t *rp = VTOR4(vp); 2738 2739 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2740 2741 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2742 "vp = %p", (void *)vp)); 2743 2744 /* 2745 * Review all the locks for the vnode, both ones that have been 2746 * acquired and ones that are pending. 
We assume that 2747 * flk_active_locks_for_vp() has merged any locks that can be 2748 * merged (so that if a process has the entire file locked, it is 2749 * represented as a single lock). 2750 * 2751 * Note that we can't bail out of the loop if we find a non-safe 2752 * lock, because we have to free all the elements in the llp list. 2753 * We might be able to speed up this code slightly by not looking 2754 * at each lock's l_start and l_len fields once we've found a 2755 * non-safe lock. 2756 */ 2757 2758 llp = flk_active_locks_for_vp(vp); 2759 while (llp) { 2760 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2761 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2762 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2763 if (!SAFE_LOCK(llp->ll_flock)) { 2764 safe = 0; 2765 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2766 "nfs4_safemap: unsafe active lock (%" PRId64 2767 ", %" PRId64 ")", llp->ll_flock.l_start, 2768 llp->ll_flock.l_len)); 2769 } 2770 next_llp = llp->ll_next; 2771 VN_RELE(llp->ll_vp); 2772 kmem_free(llp, sizeof (*llp)); 2773 llp = next_llp; 2774 } 2775 2776 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2777 safe ? "safe" : "unsafe")); 2778 return (safe); 2779 } 2780 2781 /* 2782 * Return whether there is a lost LOCK or LOCKU queued up for the given 2783 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 2784 */ 2785 2786 bool_t 2787 nfs4_map_lost_lock_conflict(vnode_t *vp) 2788 { 2789 bool_t conflict = FALSE; 2790 nfs4_lost_rqst_t *lrp; 2791 mntinfo4_t *mi = VTOMI4(vp); 2792 2793 mutex_enter(&mi->mi_lock); 2794 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2795 lrp = list_next(&mi->mi_lost_state, lrp)) { 2796 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2797 continue; 2798 ASSERT(lrp->lr_vp != NULL); 2799 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2800 continue; /* different file */ 2801 if (!SAFE_LOCK(*lrp->lr_flk)) { 2802 conflict = TRUE; 2803 break; 2804 } 2805 } 2806 2807 mutex_exit(&mi->mi_lock); 2808 return (conflict); 2809 } 2810 2811 /* 2812 * nfs_lockcompletion: 2813 * 2814 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2815 * as non cachable (set VNOCACHE bit). 2816 */ 2817 2818 void 2819 nfs4_lockcompletion(vnode_t *vp, int cmd) 2820 { 2821 rnode4_t *rp = VTOR4(vp); 2822 2823 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2824 ASSERT(!IS_SHADOW(vp, rp)); 2825 2826 if (cmd == F_SETLK || cmd == F_SETLKW) { 2827 2828 if (!nfs4_safemap(vp)) { 2829 mutex_enter(&vp->v_lock); 2830 vp->v_flag |= VNOCACHE; 2831 mutex_exit(&vp->v_lock); 2832 } else { 2833 mutex_enter(&vp->v_lock); 2834 vp->v_flag &= ~VNOCACHE; 2835 mutex_exit(&vp->v_lock); 2836 } 2837 } 2838 /* 2839 * The cached attributes of the file are stale after acquiring 2840 * the lock on the file. They were updated when the file was 2841 * opened, but not updated when the lock was acquired. Therefore the 2842 * cached attributes are invalidated after the lock is obtained. 2843 */ 2844 PURGE_ATTRCACHE4(vp); 2845 } 2846 2847 /* ARGSUSED */ 2848 static void * 2849 nfs4_mi_init(zoneid_t zoneid) 2850 { 2851 struct mi4_globals *mig; 2852 2853 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2854 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2855 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2856 offsetof(mntinfo4_t, mi_zone_node)); 2857 mig->mig_destructor_called = B_FALSE; 2858 return (mig); 2859 } 2860 2861 /* 2862 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2863 * state and killing off threads. 
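*
* (Rough order of operations in the body, for orientation: purge the DNLC,
* tell the async worker, manager and inactive threads to exit, wait for any
* recovery thread to finish with the mntinfo4, drop the zone/vfs/mi holds,
* and finally mark each of this zone's nfs4_server_t entries dead so their
* renew threads exit.)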
2864 */ 2865 /* ARGSUSED */ 2866 static void 2867 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2868 { 2869 struct mi4_globals *mig = data; 2870 mntinfo4_t *mi; 2871 nfs4_server_t *np; 2872 2873 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2874 "nfs4_mi_shutdown zone %d\n", zoneid)); 2875 ASSERT(mig != NULL); 2876 for (;;) { 2877 mutex_enter(&mig->mig_lock); 2878 mi = list_head(&mig->mig_list); 2879 if (mi == NULL) { 2880 mutex_exit(&mig->mig_lock); 2881 break; 2882 } 2883 2884 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2885 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2886 /* 2887 * purge the DNLC for this filesystem 2888 */ 2889 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2890 /* 2891 * Tell existing async worker threads to exit. 2892 */ 2893 mutex_enter(&mi->mi_async_lock); 2894 mi->mi_max_threads = 0; 2895 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2896 /* 2897 * Set the appropriate flags, signal and wait for both the 2898 * async manager and the inactive thread to exit when they're 2899 * done with their current work. 2900 */ 2901 mutex_enter(&mi->mi_lock); 2902 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2903 mutex_exit(&mi->mi_lock); 2904 mutex_exit(&mi->mi_async_lock); 2905 if (mi->mi_manager_thread) { 2906 nfs4_async_manager_stop(mi->mi_vfsp); 2907 } 2908 if (mi->mi_inactive_thread) { 2909 mutex_enter(&mi->mi_async_lock); 2910 cv_signal(&mi->mi_inact_req_cv); 2911 /* 2912 * Wait for the inactive thread to exit. 2913 */ 2914 while (mi->mi_inactive_thread != NULL) { 2915 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2916 } 2917 mutex_exit(&mi->mi_async_lock); 2918 } 2919 /* 2920 * Wait for the recovery thread to complete, that is, it will 2921 * signal when it is done using the "mi" structure and about 2922 * to exit 2923 */ 2924 mutex_enter(&mi->mi_lock); 2925 while (mi->mi_in_recovery > 0) 2926 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2927 mutex_exit(&mi->mi_lock); 2928 /* 2929 * We're done when every mi has been done or the list is empty. 2930 * This one is done, remove it from the list. 2931 */ 2932 list_remove(&mig->mig_list, mi); 2933 mutex_exit(&mig->mig_lock); 2934 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4); 2935 2936 /* 2937 * Release hold on vfs and mi done to prevent race with zone 2938 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2939 */ 2940 VFS_RELE(mi->mi_vfsp); 2941 MI4_RELE(mi); 2942 } 2943 /* 2944 * Tell each renew thread in the zone to exit 2945 */ 2946 mutex_enter(&nfs4_server_lst_lock); 2947 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2948 mutex_enter(&np->s_lock); 2949 if (np->zoneid == zoneid) { 2950 /* 2951 * We add another hold onto the nfs4_server_t 2952 * because this will make sure tha the nfs4_server_t 2953 * stays around until nfs4_callback_fini_zone destroys 2954 * the zone. This way, the renew thread can 2955 * unconditionally release its holds on the 2956 * nfs4_server_t. 
2957 */ 2958 np->s_refcnt++; 2959 nfs4_mark_srv_dead(np); 2960 } 2961 mutex_exit(&np->s_lock); 2962 } 2963 mutex_exit(&nfs4_server_lst_lock); 2964 } 2965 2966 static void 2967 nfs4_mi_free_globals(struct mi4_globals *mig) 2968 { 2969 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2970 mutex_destroy(&mig->mig_lock); 2971 kmem_free(mig, sizeof (*mig)); 2972 } 2973 2974 /* ARGSUSED */ 2975 static void 2976 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2977 { 2978 struct mi4_globals *mig = data; 2979 2980 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2981 "nfs4_mi_destroy zone %d\n", zoneid)); 2982 ASSERT(mig != NULL); 2983 mutex_enter(&mig->mig_lock); 2984 if (list_head(&mig->mig_list) != NULL) { 2985 /* Still waiting for VFS_FREEVFS() */ 2986 mig->mig_destructor_called = B_TRUE; 2987 mutex_exit(&mig->mig_lock); 2988 return; 2989 } 2990 nfs4_mi_free_globals(mig); 2991 } 2992 2993 /* 2994 * Add an NFS mount to the per-zone list of NFS mounts. 2995 */ 2996 void 2997 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2998 { 2999 struct mi4_globals *mig; 3000 3001 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3002 mutex_enter(&mig->mig_lock); 3003 list_insert_head(&mig->mig_list, mi); 3004 /* 3005 * hold added to eliminate race with zone shutdown -this will be 3006 * released in mi_shutdown 3007 */ 3008 MI4_HOLD(mi); 3009 VFS_HOLD(mi->mi_vfsp); 3010 mutex_exit(&mig->mig_lock); 3011 } 3012 3013 /* 3014 * Remove an NFS mount from the per-zone list of NFS mounts. 3015 */ 3016 int 3017 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 3018 { 3019 struct mi4_globals *mig; 3020 int ret = 0; 3021 3022 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3023 mutex_enter(&mig->mig_lock); 3024 mutex_enter(&mi->mi_lock); 3025 /* if this mi is marked dead, then the zone already released it */ 3026 if (!(mi->mi_flags & MI4_DEAD)) { 3027 list_remove(&mig->mig_list, mi); 3028 mutex_exit(&mi->mi_lock); 3029 3030 /* release the holds put on in zonelist_add(). */ 3031 VFS_RELE(mi->mi_vfsp); 3032 MI4_RELE(mi); 3033 ret = 1; 3034 } else { 3035 mutex_exit(&mi->mi_lock); 3036 } 3037 3038 /* 3039 * We can be called asynchronously by VFS_FREEVFS() after the zone 3040 * shutdown/destroy callbacks have executed; if so, clean up the zone's 3041 * mi globals. 3042 */ 3043 if (list_head(&mig->mig_list) == NULL && 3044 mig->mig_destructor_called == B_TRUE) { 3045 nfs4_mi_free_globals(mig); 3046 return (ret); 3047 } 3048 mutex_exit(&mig->mig_lock); 3049 return (ret); 3050 } 3051 3052 void 3053 nfs_free_mi4(mntinfo4_t *mi) 3054 { 3055 nfs4_open_owner_t *foop; 3056 nfs4_oo_hash_bucket_t *bucketp; 3057 nfs4_debug_msg_t *msgp; 3058 int i; 3059 servinfo4_t *svp; 3060 3061 /* 3062 * Code introduced here should be carefully evaluated to make 3063 * sure none of the freed resources are accessed either directly 3064 * or indirectly after freeing them. For eg: Introducing calls to 3065 * NFS4_DEBUG that use mntinfo4_t structure member after freeing 3066 * the structure members or other routines calling back into NFS 3067 * accessing freed mntinfo4_t structure member. 
3068 */ 3069 mutex_enter(&mi->mi_lock); 3070 ASSERT(mi->mi_recovthread == NULL); 3071 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3072 mutex_exit(&mi->mi_lock); 3073 mutex_enter(&mi->mi_async_lock); 3074 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 3075 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0); 3076 ASSERT(mi->mi_manager_thread == NULL); 3077 mutex_exit(&mi->mi_async_lock); 3078 if (mi->mi_io_kstats) { 3079 kstat_delete(mi->mi_io_kstats); 3080 mi->mi_io_kstats = NULL; 3081 } 3082 if (mi->mi_ro_kstats) { 3083 kstat_delete(mi->mi_ro_kstats); 3084 mi->mi_ro_kstats = NULL; 3085 } 3086 if (mi->mi_recov_ksp) { 3087 kstat_delete(mi->mi_recov_ksp); 3088 mi->mi_recov_ksp = NULL; 3089 } 3090 mutex_enter(&mi->mi_msg_list_lock); 3091 while (msgp = list_head(&mi->mi_msg_list)) { 3092 list_remove(&mi->mi_msg_list, msgp); 3093 nfs4_free_msg(msgp); 3094 } 3095 mutex_exit(&mi->mi_msg_list_lock); 3096 list_destroy(&mi->mi_msg_list); 3097 if (mi->mi_fname != NULL) 3098 fn_rele(&mi->mi_fname); 3099 if (mi->mi_rootfh != NULL) 3100 sfh4_rele(&mi->mi_rootfh); 3101 if (mi->mi_srvparentfh != NULL) 3102 sfh4_rele(&mi->mi_srvparentfh); 3103 svp = mi->mi_servers; 3104 sv4_free(svp); 3105 mutex_destroy(&mi->mi_lock); 3106 mutex_destroy(&mi->mi_async_lock); 3107 mutex_destroy(&mi->mi_msg_list_lock); 3108 nfs_rw_destroy(&mi->mi_recovlock); 3109 nfs_rw_destroy(&mi->mi_rename_lock); 3110 nfs_rw_destroy(&mi->mi_fh_lock); 3111 cv_destroy(&mi->mi_failover_cv); 3112 cv_destroy(&mi->mi_async_reqs_cv); 3113 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]); 3114 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]); 3115 cv_destroy(&mi->mi_async_cv); 3116 cv_destroy(&mi->mi_inact_req_cv); 3117 /* 3118 * Destroy the oo hash lists and mutexes for the cred hash table. 3119 */ 3120 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3121 bucketp = &(mi->mi_oo_list[i]); 3122 /* Destroy any remaining open owners on the list */ 3123 foop = list_head(&bucketp->b_oo_hash_list); 3124 while (foop != NULL) { 3125 list_remove(&bucketp->b_oo_hash_list, foop); 3126 nfs4_destroy_open_owner(foop); 3127 foop = list_head(&bucketp->b_oo_hash_list); 3128 } 3129 list_destroy(&bucketp->b_oo_hash_list); 3130 mutex_destroy(&bucketp->b_lock); 3131 } 3132 /* 3133 * Empty and destroy the freed open owner list. 3134 */ 3135 foop = list_head(&mi->mi_foo_list); 3136 while (foop != NULL) { 3137 list_remove(&mi->mi_foo_list, foop); 3138 nfs4_destroy_open_owner(foop); 3139 foop = list_head(&mi->mi_foo_list); 3140 } 3141 list_destroy(&mi->mi_foo_list); 3142 list_destroy(&mi->mi_bseqid_list); 3143 list_destroy(&mi->mi_lost_state); 3144 avl_destroy(&mi->mi_filehandles); 3145 kmem_free(mi, sizeof (*mi)); 3146 } 3147 void 3148 mi_hold(mntinfo4_t *mi) 3149 { 3150 atomic_add_32(&mi->mi_count, 1); 3151 ASSERT(mi->mi_count != 0); 3152 } 3153 3154 void 3155 mi_rele(mntinfo4_t *mi) 3156 { 3157 ASSERT(mi->mi_count != 0); 3158 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3159 nfs_free_mi4(mi); 3160 } 3161 } 3162 3163 vnode_t nfs4_xattr_notsupp_vnode; 3164 3165 void 3166 nfs4_clnt_init(void) 3167 { 3168 nfs4_vnops_init(); 3169 (void) nfs4_rnode_init(); 3170 (void) nfs4_shadow_init(); 3171 (void) nfs4_acache_init(); 3172 (void) nfs4_subr_init(); 3173 nfs4_acl_init(); 3174 nfs_idmap_init(); 3175 nfs4_callback_init(); 3176 nfs4_secinfo_init(); 3177 #ifdef DEBUG 3178 tsd_create(&nfs4_tsd_key, NULL); 3179 #endif 3180 3181 /* 3182 * Add a CPR callback so that we can update client 3183 * lease after a suspend and resume. 
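*
* (The callback only acts on resume: it records the resume time in
* nfs4_client_resumed, and the renew thread checks
* nfs4_client_resumed > sp->last_renewal_time to force a RENEW promptly
* after the system wakes up, since the lease may have lapsed while the
* machine was suspended.)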
3184 */ 3185 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3186 3187 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3188 nfs4_mi_destroy); 3189 3190 /* 3191 * Initialise the reference count of the notsupp xattr cache vnode to 1 3192 * so that it never goes away (VOP_INACTIVE isn't called on it). 3193 */ 3194 nfs4_xattr_notsupp_vnode.v_count = 1; 3195 } 3196 3197 void 3198 nfs4_clnt_fini(void) 3199 { 3200 (void) zone_key_delete(mi4_list_key); 3201 nfs4_vnops_fini(); 3202 (void) nfs4_rnode_fini(); 3203 (void) nfs4_shadow_fini(); 3204 (void) nfs4_acache_fini(); 3205 (void) nfs4_subr_fini(); 3206 nfs_idmap_fini(); 3207 nfs4_callback_fini(); 3208 nfs4_secinfo_fini(); 3209 #ifdef DEBUG 3210 tsd_destroy(&nfs4_tsd_key); 3211 #endif 3212 if (cid) 3213 (void) callb_delete(cid); 3214 } 3215 3216 /*ARGSUSED*/ 3217 static boolean_t 3218 nfs4_client_cpr_callb(void *arg, int code) 3219 { 3220 /* 3221 * We get called for Suspend and Resume events. 3222 * For the suspend case we simply don't care! 3223 */ 3224 if (code == CB_CODE_CPR_CHKPT) { 3225 return (B_TRUE); 3226 } 3227 3228 /* 3229 * When we get to here we are in the process of 3230 * resuming the system from a previous suspend. 3231 */ 3232 nfs4_client_resumed = gethrestime_sec(); 3233 return (B_TRUE); 3234 } 3235 3236 void 3237 nfs4_renew_lease_thread(nfs4_server_t *sp) 3238 { 3239 int error = 0; 3240 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3241 clock_t tick_delay = 0; 3242 clock_t time_left = 0; 3243 callb_cpr_t cpr_info; 3244 kmutex_t cpr_lock; 3245 3246 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3247 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3248 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3249 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3250 3251 mutex_enter(&sp->s_lock); 3252 /* sp->s_lease_time is set via a GETATTR */ 3253 sp->last_renewal_time = gethrestime_sec(); 3254 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3255 ASSERT(sp->s_refcnt >= 1); 3256 3257 for (;;) { 3258 if (!sp->state_ref_count || 3259 sp->lease_valid != NFS4_LEASE_VALID) { 3260 3261 kip_secs = MAX((sp->s_lease_time >> 1) - 3262 (3 * sp->propagation_delay.tv_sec), 1); 3263 3264 tick_delay = SEC_TO_TICK(kip_secs); 3265 3266 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3267 "nfs4_renew_lease_thread: no renew : thread " 3268 "wait %ld secs", kip_secs)); 3269 3270 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3271 "nfs4_renew_lease_thread: no renew : " 3272 "state_ref_count %d, lease_valid %d", 3273 sp->state_ref_count, sp->lease_valid)); 3274 3275 mutex_enter(&cpr_lock); 3276 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3277 mutex_exit(&cpr_lock); 3278 time_left = cv_reltimedwait(&sp->cv_thread_exit, 3279 &sp->s_lock, tick_delay, TR_CLOCK_TICK); 3280 mutex_enter(&cpr_lock); 3281 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3282 mutex_exit(&cpr_lock); 3283 3284 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3285 "nfs4_renew_lease_thread: no renew: " 3286 "time left %ld", time_left)); 3287 3288 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3289 goto die; 3290 continue; 3291 } 3292 3293 tmp_last_renewal_time = sp->last_renewal_time; 3294 3295 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3296 (3 * sp->propagation_delay.tv_sec); 3297 3298 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3299 "nfs4_renew_lease_thread: tmp_time %ld, " 3300 "sp->last_renewal_time %ld", tmp_time, 3301 sp->last_renewal_time)); 3302 3303 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3304 3305 
tick_delay = SEC_TO_TICK(kip_secs); 3306 3307 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3308 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3309 "secs", kip_secs)); 3310 3311 mutex_enter(&cpr_lock); 3312 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3313 mutex_exit(&cpr_lock); 3314 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock, 3315 tick_delay, TR_CLOCK_TICK); 3316 mutex_enter(&cpr_lock); 3317 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3318 mutex_exit(&cpr_lock); 3319 3320 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3321 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3322 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3323 "tmp_last_renewal_time %ld", time_left, 3324 sp->last_renewal_time, nfs4_client_resumed, 3325 tmp_last_renewal_time)); 3326 3327 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3328 goto die; 3329 3330 if (tmp_last_renewal_time == sp->last_renewal_time || 3331 (nfs4_client_resumed != 0 && 3332 nfs4_client_resumed > sp->last_renewal_time)) { 3333 /* 3334 * Issue RENEW op since we haven't renewed the lease 3335 * since we slept. 3336 */ 3337 tmp_now_time = gethrestime_sec(); 3338 error = nfs4renew(sp); 3339 /* 3340 * Need to re-acquire sp's lock, nfs4renew() 3341 * relinquishes it. 3342 */ 3343 mutex_enter(&sp->s_lock); 3344 3345 /* 3346 * See if someone changed s_thread_exit while we gave 3347 * up s_lock. 3348 */ 3349 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3350 goto die; 3351 3352 if (!error) { 3353 /* 3354 * check to see if we implicitly renewed while 3355 * we waited for a reply for our RENEW call. 3356 */ 3357 if (tmp_last_renewal_time == 3358 sp->last_renewal_time) { 3359 /* no implicit renew came */ 3360 sp->last_renewal_time = tmp_now_time; 3361 } else { 3362 NFS4_DEBUG(nfs4_client_lease_debug, 3363 (CE_NOTE, "renew_thread: did " 3364 "implicit renewal before reply " 3365 "from server for RENEW")); 3366 } 3367 } else { 3368 /* figure out error */ 3369 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3370 "renew_thread: nfs4renew returned error" 3371 " %d", error)); 3372 } 3373 3374 } 3375 } 3376 3377 die: 3378 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3379 "nfs4_renew_lease_thread: thread exiting")); 3380 3381 while (sp->s_otw_call_count != 0) { 3382 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3383 "nfs4_renew_lease_thread: waiting for outstanding " 3384 "otw calls to finish for sp 0x%p, current " 3385 "s_otw_call_count %d", (void *)sp, 3386 sp->s_otw_call_count)); 3387 mutex_enter(&cpr_lock); 3388 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3389 mutex_exit(&cpr_lock); 3390 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3391 mutex_enter(&cpr_lock); 3392 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3393 mutex_exit(&cpr_lock); 3394 } 3395 mutex_exit(&sp->s_lock); 3396 3397 nfs4_server_rele(sp); /* free the thread's reference */ 3398 nfs4_server_rele(sp); /* free the list's reference */ 3399 sp = NULL; 3400 3401 done: 3402 mutex_enter(&cpr_lock); 3403 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3404 mutex_destroy(&cpr_lock); 3405 3406 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3407 "nfs4_renew_lease_thread: renew thread exit officially")); 3408 3409 zthread_exit(); 3410 /* NOT REACHED */ 3411 } 3412 3413 /* 3414 * Send out a RENEW op to the server. 3415 * Assumes sp is locked down.
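*
* Called with sp->s_lock held; returns with it dropped, and the renew
* thread reacquires it afterwards. (As a purely illustrative timing
* example: with a 90 second lease and a 1 second propagation estimate,
* the renew thread above sleeps roughly 90/2 - 3*1 = 42 seconds between
* calls here; the real values come from the server's lease time and the
* measured round-trip delay.)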
3416 */ 3417 static int 3418 nfs4renew(nfs4_server_t *sp) 3419 { 3420 COMPOUND4args_clnt args; 3421 COMPOUND4res_clnt res; 3422 nfs_argop4 argop[1]; 3423 int doqueue = 1; 3424 int rpc_error; 3425 cred_t *cr; 3426 mntinfo4_t *mi; 3427 timespec_t prop_time, after_time; 3428 int needrecov = FALSE; 3429 nfs4_recov_state_t recov_state; 3430 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3431 3432 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3433 3434 recov_state.rs_flags = 0; 3435 recov_state.rs_num_retry_despite_err = 0; 3436 3437 recov_retry: 3438 mi = sp->mntinfo4_list; 3439 VFS_HOLD(mi->mi_vfsp); 3440 mutex_exit(&sp->s_lock); 3441 ASSERT(mi != NULL); 3442 3443 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3444 if (e.error) { 3445 VFS_RELE(mi->mi_vfsp); 3446 return (e.error); 3447 } 3448 3449 /* Check to see if we're dealing with a marked-dead sp */ 3450 mutex_enter(&sp->s_lock); 3451 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3452 mutex_exit(&sp->s_lock); 3453 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3454 VFS_RELE(mi->mi_vfsp); 3455 return (0); 3456 } 3457 3458 /* Make sure mi hasn't changed on us */ 3459 if (mi != sp->mntinfo4_list) { 3460 /* Must drop sp's lock to avoid a recursive mutex enter */ 3461 mutex_exit(&sp->s_lock); 3462 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3463 VFS_RELE(mi->mi_vfsp); 3464 mutex_enter(&sp->s_lock); 3465 goto recov_retry; 3466 } 3467 mutex_exit(&sp->s_lock); 3468 3469 args.ctag = TAG_RENEW; 3470 3471 args.array_len = 1; 3472 args.array = argop; 3473 3474 argop[0].argop = OP_RENEW; 3475 3476 mutex_enter(&sp->s_lock); 3477 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3478 cr = sp->s_cred; 3479 crhold(cr); 3480 mutex_exit(&sp->s_lock); 3481 3482 ASSERT(cr != NULL); 3483 3484 /* used to figure out RTT for sp */ 3485 gethrestime(&prop_time); 3486 3487 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3488 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3489 (void*)sp)); 3490 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3491 prop_time.tv_sec, prop_time.tv_nsec)); 3492 3493 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3494 mntinfo4_t *, mi); 3495 3496 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3497 crfree(cr); 3498 3499 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3500 mntinfo4_t *, mi); 3501 3502 gethrestime(&after_time); 3503 3504 mutex_enter(&sp->s_lock); 3505 sp->propagation_delay.tv_sec = 3506 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3507 mutex_exit(&sp->s_lock); 3508 3509 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3510 after_time.tv_sec, after_time.tv_nsec)); 3511 3512 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3513 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3514 nfs4_delegreturn_all(sp); 3515 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3516 VFS_RELE(mi->mi_vfsp); 3517 /* 3518 * If the server returns CB_PATH_DOWN, it has renewed 3519 * the lease and informed us that the callback path is 3520 * down. Since the lease is renewed, just return 0 and 3521 * let the renew thread proceed as normal. 
3522 */ 3523 return (0); 3524 } 3525 3526 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3527 if (!needrecov && e.error) { 3528 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3529 VFS_RELE(mi->mi_vfsp); 3530 return (e.error); 3531 } 3532 3533 rpc_error = e.error; 3534 3535 if (needrecov) { 3536 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3537 "nfs4renew: initiating recovery\n")); 3538 3539 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3540 OP_RENEW, NULL, NULL, NULL) == FALSE) { 3541 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3542 VFS_RELE(mi->mi_vfsp); 3543 if (!e.error) 3544 (void) xdr_free(xdr_COMPOUND4res_clnt, 3545 (caddr_t)&res); 3546 mutex_enter(&sp->s_lock); 3547 goto recov_retry; 3548 } 3549 /* fall through for res.status case */ 3550 } 3551 3552 if (res.status) { 3553 if (res.status == NFS4ERR_LEASE_MOVED) { 3554 /*EMPTY*/ 3555 /* 3556 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3557 * to renew the lease on that server 3558 */ 3559 } 3560 e.error = geterrno4(res.status); 3561 } 3562 3563 if (!rpc_error) 3564 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3565 3566 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3567 3568 VFS_RELE(mi->mi_vfsp); 3569 3570 return (e.error); 3571 } 3572 3573 void 3574 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3575 { 3576 nfs4_server_t *sp; 3577 3578 /* this locks down sp if it is found */ 3579 sp = find_nfs4_server(mi); 3580 3581 if (sp != NULL) { 3582 nfs4_inc_state_ref_count_nolock(sp, mi); 3583 mutex_exit(&sp->s_lock); 3584 nfs4_server_rele(sp); 3585 } 3586 } 3587 3588 /* 3589 * Bump the number of OPEN files (ie: those with state) so we know if this 3590 * nfs4_server has any state to maintain a lease for or not. 3591 * 3592 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3593 */ 3594 void 3595 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3596 { 3597 ASSERT(mutex_owned(&sp->s_lock)); 3598 3599 sp->state_ref_count++; 3600 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3601 "nfs4_inc_state_ref_count: state_ref_count now %d", 3602 sp->state_ref_count)); 3603 3604 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3605 sp->lease_valid = NFS4_LEASE_VALID; 3606 3607 /* 3608 * If this call caused the lease to be marked valid and/or 3609 * took the state_ref_count from 0 to 1, then start the time 3610 * on lease renewal. 3611 */ 3612 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3613 sp->last_renewal_time = gethrestime_sec(); 3614 3615 /* update the number of open files for mi */ 3616 mi->mi_open_files++; 3617 } 3618 3619 void 3620 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3621 { 3622 nfs4_server_t *sp; 3623 3624 /* this locks down sp if it is found */ 3625 sp = find_nfs4_server_all(mi, 1); 3626 3627 if (sp != NULL) { 3628 nfs4_dec_state_ref_count_nolock(sp, mi); 3629 mutex_exit(&sp->s_lock); 3630 nfs4_server_rele(sp); 3631 } 3632 } 3633 3634 /* 3635 * Decrement the number of OPEN files (ie: those with state) so we know if 3636 * this nfs4_server has any state to maintain a lease for or not. 
3637 */ 3638 void 3639 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3640 { 3641 ASSERT(mutex_owned(&sp->s_lock)); 3642 ASSERT(sp->state_ref_count != 0); 3643 sp->state_ref_count--; 3644 3645 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3646 "nfs4_dec_state_ref_count: state ref count now %d", 3647 sp->state_ref_count)); 3648 3649 mi->mi_open_files--; 3650 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3651 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3652 mi->mi_open_files, mi->mi_flags)); 3653 3654 /* We don't have to hold the mi_lock to test mi_flags */ 3655 if (mi->mi_open_files == 0 && 3656 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3657 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3658 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3659 "we have closed the last open file", (void*)mi)); 3660 nfs4_remove_mi_from_server(mi, sp); 3661 } 3662 } 3663 3664 bool_t 3665 inlease(nfs4_server_t *sp) 3666 { 3667 bool_t result; 3668 3669 ASSERT(mutex_owned(&sp->s_lock)); 3670 3671 if (sp->lease_valid == NFS4_LEASE_VALID && 3672 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3673 result = TRUE; 3674 else 3675 result = FALSE; 3676 3677 return (result); 3678 } 3679 3680 3681 /* 3682 * Return non-zero if the given nfs4_server_t is going through recovery. 3683 */ 3684 3685 int 3686 nfs4_server_in_recovery(nfs4_server_t *sp) 3687 { 3688 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3689 } 3690 3691 /* 3692 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3693 * first is less than, equal to, or greater than the second. 3694 */ 3695 3696 int 3697 sfh4cmp(const void *p1, const void *p2) 3698 { 3699 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3700 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3701 3702 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3703 } 3704 3705 /* 3706 * Create a table for shared filehandle objects. 3707 */ 3708 3709 void 3710 sfh4_createtab(avl_tree_t *tab) 3711 { 3712 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3713 offsetof(nfs4_sharedfh_t, sfh_tree)); 3714 } 3715 3716 /* 3717 * Return a shared filehandle object for the given filehandle. The caller 3718 * is responsible for eventually calling sfh4_rele(). 3719 */ 3720 3721 nfs4_sharedfh_t * 3722 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3723 { 3724 nfs4_sharedfh_t *sfh, *nsfh; 3725 avl_index_t where; 3726 nfs4_sharedfh_t skey; 3727 3728 if (!key) { 3729 skey.sfh_fh = *fh; 3730 key = &skey; 3731 } 3732 3733 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3734 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3735 /* 3736 * We allocate the largest possible filehandle size because it's 3737 * not that big, and it saves us from possibly having to resize the 3738 * buffer later. 
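*
* (The buffer is allocated before mi_fh_lock is taken on purpose: if
* another thread won the race and a matching entry is already in the AVL
* tree, the code below simply bumps that entry's refcount and frees this
* speculative allocation again.)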
3739 */ 3740 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3741 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3742 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3743 nsfh->sfh_refcnt = 1; 3744 nsfh->sfh_flags = SFH4_IN_TREE; 3745 nsfh->sfh_mi = mi; 3746 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3747 (void *)nsfh)); 3748 3749 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3750 sfh = avl_find(&mi->mi_filehandles, key, &where); 3751 if (sfh != NULL) { 3752 mutex_enter(&sfh->sfh_lock); 3753 sfh->sfh_refcnt++; 3754 mutex_exit(&sfh->sfh_lock); 3755 nfs_rw_exit(&mi->mi_fh_lock); 3756 /* free our speculative allocs */ 3757 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3758 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3759 return (sfh); 3760 } 3761 3762 avl_insert(&mi->mi_filehandles, nsfh, where); 3763 nfs_rw_exit(&mi->mi_fh_lock); 3764 3765 return (nsfh); 3766 } 3767 3768 /* 3769 * Return a shared filehandle object for the given filehandle. The caller 3770 * is responsible for eventually calling sfh4_rele(). 3771 */ 3772 3773 nfs4_sharedfh_t * 3774 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3775 { 3776 nfs4_sharedfh_t *sfh; 3777 nfs4_sharedfh_t key; 3778 3779 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3780 3781 #ifdef DEBUG 3782 if (nfs4_sharedfh_debug) { 3783 nfs4_fhandle_t fhandle; 3784 3785 fhandle.fh_len = fh->nfs_fh4_len; 3786 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3787 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3788 nfs4_printfhandle(&fhandle); 3789 } 3790 #endif 3791 3792 /* 3793 * If there's already an object for the given filehandle, bump the 3794 * reference count and return it. Otherwise, create a new object 3795 * and add it to the AVL tree. 3796 */ 3797 3798 key.sfh_fh = *fh; 3799 3800 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3801 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3802 if (sfh != NULL) { 3803 mutex_enter(&sfh->sfh_lock); 3804 sfh->sfh_refcnt++; 3805 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3806 "sfh4_get: found existing %p, new refcnt=%d", 3807 (void *)sfh, sfh->sfh_refcnt)); 3808 mutex_exit(&sfh->sfh_lock); 3809 nfs_rw_exit(&mi->mi_fh_lock); 3810 return (sfh); 3811 } 3812 nfs_rw_exit(&mi->mi_fh_lock); 3813 3814 return (sfh4_put(fh, mi, &key)); 3815 } 3816 3817 /* 3818 * Get a reference to the given shared filehandle object. 3819 */ 3820 3821 void 3822 sfh4_hold(nfs4_sharedfh_t *sfh) 3823 { 3824 ASSERT(sfh->sfh_refcnt > 0); 3825 3826 mutex_enter(&sfh->sfh_lock); 3827 sfh->sfh_refcnt++; 3828 NFS4_DEBUG(nfs4_sharedfh_debug, 3829 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3830 (void *)sfh, sfh->sfh_refcnt)); 3831 mutex_exit(&sfh->sfh_lock); 3832 } 3833 3834 /* 3835 * Release a reference to the given shared filehandle object and null out 3836 * the given pointer. 3837 */ 3838 3839 void 3840 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3841 { 3842 mntinfo4_t *mi; 3843 nfs4_sharedfh_t *sfh = *sfhpp; 3844 3845 ASSERT(sfh->sfh_refcnt > 0); 3846 3847 mutex_enter(&sfh->sfh_lock); 3848 if (sfh->sfh_refcnt > 1) { 3849 sfh->sfh_refcnt--; 3850 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3851 "sfh4_rele %p, new refcnt=%d", 3852 (void *)sfh, sfh->sfh_refcnt)); 3853 mutex_exit(&sfh->sfh_lock); 3854 goto finish; 3855 } 3856 mutex_exit(&sfh->sfh_lock); 3857 3858 /* 3859 * Possibly the last reference, so get the lock for the table in 3860 * case it's time to remove the object from the table. 
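*
* (The refcount is re-checked below after taking mi_fh_lock: another
* thread may have looked the filehandle up and taken a fresh hold between
* the sfh_lock release above and this point, in which case the object must
* not be torn down.)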
/*
 * Release a reference to the given shared filehandle object and null out
 * the given pointer.
 */

void
sfh4_rele(nfs4_sharedfh_t **sfhpp)
{
	mntinfo4_t *mi;
	nfs4_sharedfh_t *sfh = *sfhpp;

	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_refcnt > 1) {
		sfh->sfh_refcnt--;
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		goto finish;
	}
	mutex_exit(&sfh->sfh_lock);

	/*
	 * Possibly the last reference, so get the lock for the table in
	 * case it's time to remove the object from the table.
	 */
	mi = sfh->sfh_mi;
	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt--;
	if (sfh->sfh_refcnt > 0) {
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		nfs_rw_exit(&mi->mi_fh_lock);
		goto finish;
	}

	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
	    "sfh4_rele %p, last ref", (void *)sfh));
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	nfs_rw_exit(&mi->mi_fh_lock);
	mutex_destroy(&sfh->sfh_lock);
	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
	kmem_free(sfh, sizeof (nfs4_sharedfh_t));

finish:
	*sfhpp = NULL;
}

/*
 * Update the filehandle for the given shared filehandle object.
 */

int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */

void
sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
{
	mntinfo4_t *mi = sfh->sfh_mi;
	nfs4_sharedfh_t *dupsfh;
	avl_index_t where;
	nfs4_sharedfh_t key;

#ifdef DEBUG
	mutex_enter(&sfh->sfh_lock);
	ASSERT(sfh->sfh_refcnt > 0);
	mutex_exit(&sfh->sfh_lock);
#endif
	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);

	/*
	 * The basic plan is to remove the shared filehandle object from
	 * the table, update it to have the new filehandle, then reinsert
	 * it.
	 */

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
	    sfh->sfh_fh.nfs_fh4_len);

	/*
	 * XXX If there is already a shared filehandle object with the new
	 * filehandle, we're in trouble, because the rnode code assumes
	 * that there is only one shared filehandle object for a given
	 * filehandle.  So issue a warning (for read-write mounts only)
	 * and don't try to re-insert the given object into the table.
	 * Hopefully the given object will quickly go away and everyone
	 * will use the new object.
	 */
	key.sfh_fh = *newfh;
	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
	if (dupsfh != NULL) {
		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
			    "duplicate filehandle detected");
			sfh4_printfhandle(dupsfh);
		}
	} else {
		avl_insert(&mi->mi_filehandles, sfh, where);
		mutex_enter(&sfh->sfh_lock);
		sfh->sfh_flags |= SFH4_IN_TREE;
		mutex_exit(&sfh->sfh_lock);
	}
	nfs_rw_exit(&mi->mi_fh_lock);
}
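/*
 * Lock-ordering note (a summary of the code above, not a new rule):
 * whenever both locks are needed, mi->mi_fh_lock is taken before
 * sfh->sfh_lock.  sfh4_update() holds mi->mi_fh_lock as writer across
 * the remove/update/reinsert sequence, so other table users never see
 * the object with a half-updated filehandle.
 */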
/*
 * Copy out the current filehandle for the given shared filehandle object.
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}

/*
 * Print out the filehandle for the given shared filehandle object.
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}

/*
 * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
 * if they're the same, +1 if the first is "greater" than the second.  The
 * caller (or whoever's calling the AVL package) is responsible for
 * handling locking issues.
 */

static int
fncmp(const void *p1, const void *p2)
{
	const nfs4_fname_t *f1 = p1;
	const nfs4_fname_t *f2 = p2;
	int res;

	res = strcmp(f1->fn_name, f2->fn_name);
	/*
	 * The AVL package wants +/-1, not arbitrary positive or negative
	 * integers.
	 */
	if (res > 0)
		res = 1;
	else if (res < 0)
		res = -1;
	return (res);
}

/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released later, in case
			 * this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  Return it
				 * with the hold taken above.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry with a mismatching
			 * fn_sfh.  This could be a stale entry due to a
			 * server-side rename.  Remove it and retry so
			 * that no such stale entries remain.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			fn_rele(&fnp);
			goto again;
		}
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
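/*
 * Illustrative usage (a sketch, not compiled code; the variable names
 * are examples only):
 *
 *	nfs4_fname_t *fn;
 *
 *	fn = fn_get(dirfn, "foo", sfh);	-- holds dirfn and sfh internally
 *	...
 *	fn_rele(&fn);			-- fn is NULL on return
 *
 * An existing child entry is reused only when both the name and the
 * shared filehandle match; a name match with a different sfh is treated
 * as stale (e.g. left over from a server-side rename) and is removed
 * before a fresh entry is created.
 */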
void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_add_32(&fnp->fn_refcnt, 1);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}

/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	sfh4_rele(&fnp->fn_sfh);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}
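/*
 * Note (descriptive only): dropping the last reference to an fname also
 * drops the reference it held on its parent, which may in turn free the
 * parent, and so on up the chain.  The goto above walks that chain
 * iteratively, so releasing a very deep path cannot overflow the kernel
 * stack.
 */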
/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing.  Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}

/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs
 * a new string which looks like "prepend" + "/" + "current",
 * by allocating a new string and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for a new string;
		 * +1 +1 is for the "/" and the NUL
		 * byte at the end of it all.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}

/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing.  Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}
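/*
 * Worked example (illustrative): if the leaf fname is "file" and its
 * ancestors are named "home" and then "export", fn_path() visits the
 * components leaf-first and fn_path_realloc() prepends each one:
 *
 *	"file"  ->  "home/file"  ->  "export/home/file"
 *
 * fn_path_realloc() always allocates exactly strlen(result) + 1 bytes,
 * so the caller presumably frees the result with
 * kmem_free(path, strlen(path) + 1).
 */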
/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}

/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips it means that either:
	 *	a) the code in nfs4rename that detects this case is broken, or
	 *	b) the server is broken (since it allowed the bogus rename).
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from: mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent.  It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In that case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		fn_rele(&fnp->fn_parent);
	}

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}

#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
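/*
 * Since nfs4_consistent_type() either returns 1 or panics, callers are
 * presumably expected to invoke it as ASSERT(nfs4_consistent_type(vp)),
 * so that the check, like the function itself, compiles away entirely
 * in non-DEBUG kernels.
 */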