1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
27 * All Rights Reserved 28 */ 29 30 #include <sys/param.h> 31 #include <sys/types.h> 32 #include <sys/systm.h> 33 #include <sys/thread.h> 34 #include <sys/t_lock.h> 35 #include <sys/time.h> 36 #include <sys/vnode.h> 37 #include <sys/vfs.h> 38 #include <sys/errno.h> 39 #include <sys/buf.h> 40 #include <sys/stat.h> 41 #include <sys/cred.h> 42 #include <sys/kmem.h> 43 #include <sys/debug.h> 44 #include <sys/dnlc.h> 45 #include <sys/vmsystm.h> 46 #include <sys/flock.h> 47 #include <sys/share.h> 48 #include <sys/cmn_err.h> 49 #include <sys/tiuser.h> 50 #include <sys/sysmacros.h> 51 #include <sys/callb.h> 52 #include <sys/acl.h> 53 #include <sys/kstat.h> 54 #include <sys/signal.h> 55 #include <sys/disp.h> 56 #include <sys/atomic.h> 57 #include <sys/list.h> 58 #include <sys/sdt.h> 59 60 #include <rpc/types.h> 61 #include <rpc/xdr.h> 62 #include <rpc/auth.h> 63 #include <rpc/clnt.h> 64 65 #include <nfs/nfs.h> 66 #include <nfs/nfs_clnt.h> 67 #include <nfs/nfs_acl.h> 68 69 #include <nfs/nfs4.h> 70 #include <nfs/rnode4.h> 71 #include <nfs/nfs4_clnt.h> 72 73 #include <vm/hat.h> 74 #include <vm/as.h> 75 #include <vm/page.h> 76 #include <vm/pvn.h> 77 #include <vm/seg.h> 78 #include <vm/seg_map.h> 79 #include <vm/seg_vn.h> 80 81 #include <sys/ddi.h> 82 83 /* 84 * Arguments to page-flush thread. 
85 */ 86 typedef struct { 87 vnode_t *vp; 88 cred_t *cr; 89 } pgflush_t; 90 91 #ifdef DEBUG 92 int nfs4_client_lease_debug; 93 int nfs4_sharedfh_debug; 94 int nfs4_fname_debug; 95 96 /* temporary: panic if v_type is inconsistent with r_attr va_type */ 97 int nfs4_vtype_debug; 98 99 uint_t nfs4_tsd_key; 100 #endif 101 102 static time_t nfs4_client_resumed = 0; 103 static callb_id_t cid = 0; 104 105 static int nfs4renew(nfs4_server_t *); 106 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); 107 static void nfs4_pgflush_thread(pgflush_t *); 108 109 static boolean_t nfs4_client_cpr_callb(void *, int); 110 111 struct mi4_globals { 112 kmutex_t mig_lock; /* lock protecting mig_list */ 113 list_t mig_list; /* list of NFS v4 mounts in zone */ 114 boolean_t mig_destructor_called; 115 }; 116 117 static zone_key_t mi4_list_key; 118 119 /* 120 * Attributes caching: 121 * 122 * Attributes are cached in the rnode in struct vattr form. 123 * There is a time associated with the cached attributes (r_time_attr_inval) 124 * which tells whether the attributes are valid. The time is initialized 125 * to the difference between current time and the modify time of the vnode 126 * when new attributes are cached. This allows the attributes for 127 * files that have changed recently to be timed out sooner than for files 128 * that have not changed for a long time. There are minimum and maximum 129 * timeout values that can be set per mount point. 130 */ 131 132 /* 133 * If a cache purge is in progress, wait for it to finish. 134 * 135 * The current thread must not be in the middle of an 136 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock 137 * between this thread, a recovery thread, and the page flush thread. 
138 */ 139 int 140 nfs4_waitfor_purge_complete(vnode_t *vp) 141 { 142 rnode4_t *rp; 143 k_sigset_t smask; 144 145 rp = VTOR4(vp); 146 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 147 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 148 mutex_enter(&rp->r_statelock); 149 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 150 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 151 ((rp->r_flags & R4PGFLUSH) && 152 rp->r_pgflush != curthread)) { 153 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 154 sigunintr(&smask); 155 mutex_exit(&rp->r_statelock); 156 return (EINTR); 157 } 158 } 159 sigunintr(&smask); 160 mutex_exit(&rp->r_statelock); 161 } 162 return (0); 163 } 164 165 /* 166 * Validate caches by checking cached attributes. If they have timed out, 167 * then get new attributes from the server. As a side effect, cache 168 * invalidation is done if the attributes have changed. 169 * 170 * If the attributes have not timed out and if there is a cache 171 * invalidation being done by some other thread, then wait until that 172 * thread has completed the cache invalidation. 173 */ 174 int 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 176 { 177 int error; 178 nfs4_ga_res_t gar; 179 180 if (ATTRCACHE4_VALID(vp)) { 181 error = nfs4_waitfor_purge_complete(vp); 182 if (error) 183 return (error); 184 return (0); 185 } 186 187 gar.n4g_va.va_mask = AT_ALL; 188 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 189 } 190 191 /* 192 * Fill in attribute from the cache. 193 * If valid, then return 0 to indicate that no error occurred, 194 * otherwise return 1 to indicate that an error occurred. 
195 */ 196 static int 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 198 { 199 rnode4_t *rp; 200 201 rp = VTOR4(vp); 202 mutex_enter(&rp->r_statelock); 203 mutex_enter(&rp->r_statev4_lock); 204 if (ATTRCACHE4_VALID(vp)) { 205 mutex_exit(&rp->r_statev4_lock); 206 /* 207 * Cached attributes are valid 208 */ 209 *vap = rp->r_attr; 210 mutex_exit(&rp->r_statelock); 211 return (0); 212 } 213 mutex_exit(&rp->r_statev4_lock); 214 mutex_exit(&rp->r_statelock); 215 return (1); 216 } 217 218 219 /* 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 221 * call is synchronous because all the pages were invalidated by the 222 * nfs4_invalidate_pages() call. 223 */ 224 void 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 226 { 227 struct rnode4 *rp = VTOR4(vp); 228 229 /* Ensure that the ..._end_op() call has been done */ 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 231 232 if (errno != ESTALE) 233 return; 234 235 mutex_enter(&rp->r_statelock); 236 rp->r_flags |= R4STALE; 237 if (!rp->r_error) 238 rp->r_error = errno; 239 mutex_exit(&rp->r_statelock); 240 if (nfs4_has_pages(vp)) 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 243 } 244 245 /* 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 247 * page purge is done asynchronously. 248 */ 249 void 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 251 { 252 rnode4_t *rp; 253 char *contents; 254 vnode_t *xattr; 255 int size; 256 int pgflush; /* are we the page flush thread? */ 257 258 /* 259 * Purge the DNLC for any entries which refer to this file. 260 */ 261 if (vp->v_count > 1 && 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 263 dnlc_purge_vp(vp); 264 265 /* 266 * Clear any readdir state bits and purge the readlink response cache. 
267 */ 268 rp = VTOR4(vp); 269 mutex_enter(&rp->r_statelock); 270 rp->r_flags &= ~R4LOOKUP; 271 contents = rp->r_symlink.contents; 272 size = rp->r_symlink.size; 273 rp->r_symlink.contents = NULL; 274 275 xattr = rp->r_xattr_dir; 276 rp->r_xattr_dir = NULL; 277 278 /* 279 * Purge pathconf cache too. 280 */ 281 rp->r_pathconf.pc4_xattr_valid = 0; 282 rp->r_pathconf.pc4_cache_valid = 0; 283 284 pgflush = (curthread == rp->r_pgflush); 285 mutex_exit(&rp->r_statelock); 286 287 if (contents != NULL) { 288 289 kmem_free((void *)contents, size); 290 } 291 292 if (xattr != NULL) 293 VN_RELE(xattr); 294 295 /* 296 * Flush the page cache. If the current thread is the page flush 297 * thread, don't initiate a new page flush. There's no need for 298 * it, and doing it correctly is hard. 299 */ 300 if (nfs4_has_pages(vp) && !pgflush) { 301 if (!asyncpg) { 302 (void) nfs4_waitfor_purge_complete(vp); 303 nfs4_flush_pages(vp, cr); 304 } else { 305 pgflush_t *args; 306 307 /* 308 * We don't hold r_statelock while creating the 309 * thread, in case the call blocks. So we use a 310 * flag to indicate that a page flush thread is 311 * active. 312 */ 313 mutex_enter(&rp->r_statelock); 314 if (rp->r_flags & R4PGFLUSH) { 315 mutex_exit(&rp->r_statelock); 316 } else { 317 rp->r_flags |= R4PGFLUSH; 318 mutex_exit(&rp->r_statelock); 319 320 args = kmem_alloc(sizeof (pgflush_t), 321 KM_SLEEP); 322 args->vp = vp; 323 VN_HOLD(args->vp); 324 args->cr = cr; 325 crhold(args->cr); 326 (void) zthread_create(NULL, 0, 327 nfs4_pgflush_thread, args, 0, 328 minclsyspri); 329 } 330 } 331 } 332 333 /* 334 * Flush the readdir response cache. 335 */ 336 nfs4_purge_rddir_cache(vp); 337 } 338 339 /* 340 * Invalidate all pages for the given file, after writing back the dirty 341 * ones. 
342 */ 343 344 void 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr) 346 { 347 int error; 348 rnode4_t *rp = VTOR4(vp); 349 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 351 if (error == ENOSPC || error == EDQUOT) { 352 mutex_enter(&rp->r_statelock); 353 if (!rp->r_error) 354 rp->r_error = error; 355 mutex_exit(&rp->r_statelock); 356 } 357 } 358 359 /* 360 * Page flush thread. 361 */ 362 363 static void 364 nfs4_pgflush_thread(pgflush_t *args) 365 { 366 rnode4_t *rp = VTOR4(args->vp); 367 368 /* remember which thread we are, so we don't deadlock ourselves */ 369 mutex_enter(&rp->r_statelock); 370 ASSERT(rp->r_pgflush == NULL); 371 rp->r_pgflush = curthread; 372 mutex_exit(&rp->r_statelock); 373 374 nfs4_flush_pages(args->vp, args->cr); 375 376 mutex_enter(&rp->r_statelock); 377 rp->r_pgflush = NULL; 378 rp->r_flags &= ~R4PGFLUSH; 379 cv_broadcast(&rp->r_cv); 380 mutex_exit(&rp->r_statelock); 381 382 VN_RELE(args->vp); 383 crfree(args->cr); 384 kmem_free(args, sizeof (pgflush_t)); 385 zthread_exit(); 386 } 387 388 /* 389 * Purge the readdir cache of all entries which are not currently 390 * being filled. 391 */ 392 void 393 nfs4_purge_rddir_cache(vnode_t *vp) 394 { 395 rnode4_t *rp; 396 397 rp = VTOR4(vp); 398 399 mutex_enter(&rp->r_statelock); 400 rp->r_direof = NULL; 401 rp->r_flags &= ~R4LOOKUP; 402 rp->r_flags |= R4READDIRWATTR; 403 rddir4_cache_purge(rp); 404 mutex_exit(&rp->r_statelock); 405 } 406 407 /* 408 * Set attributes cache for given vnode using virtual attributes. There is 409 * no cache validation, but if the attributes are deemed to be stale, they 410 * are ignored. This corresponds to nfs3_attrcache(). 411 * 412 * Set the timeout value on the attribute cache and fill it 413 * with the passed in attributes. 
414 */ 415 void 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 417 { 418 rnode4_t *rp = VTOR4(vp); 419 420 mutex_enter(&rp->r_statelock); 421 if (rp->r_time_attr_saved <= t) 422 nfs4_attrcache_va(vp, garp, FALSE); 423 mutex_exit(&rp->r_statelock); 424 } 425 426 /* 427 * Use the passed in virtual attributes to check to see whether the 428 * data and metadata caches are valid, cache the new attributes, and 429 * then do the cache invalidation if required. 430 * 431 * The cache validation and caching of the new attributes is done 432 * atomically via the use of the mutex, r_statelock. If required, 433 * the cache invalidation is done atomically w.r.t. the cache 434 * validation and caching of the attributes via the pseudo lock, 435 * r_serial. 436 * 437 * This routine is used to do cache validation and attributes caching 438 * for operations with a single set of post operation attributes. 439 */ 440 441 void 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 443 hrtime_t t, cred_t *cr, int async, 444 change_info4 *cinfo) 445 { 446 rnode4_t *rp; 447 int mtime_changed = 0; 448 int ctime_changed = 0; 449 vsecattr_t *vsp; 450 int was_serial, set_time_cache_inval, recov; 451 vattr_t *vap = &garp->n4g_va; 452 mntinfo4_t *mi = VTOMI4(vp); 453 len_t preattr_rsize; 454 boolean_t writemodify_set = B_FALSE; 455 boolean_t cachepurge_set = B_FALSE; 456 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 458 459 /* Is curthread the recovery thread? 
*/ 460 mutex_enter(&mi->mi_lock); 461 recov = (VTOMI4(vp)->mi_recovthread == curthread); 462 mutex_exit(&mi->mi_lock); 463 464 rp = VTOR4(vp); 465 mutex_enter(&rp->r_statelock); 466 was_serial = (rp->r_serial == curthread); 467 if (rp->r_serial && !was_serial) { 468 klwp_t *lwp = ttolwp(curthread); 469 470 /* 471 * If we're the recovery thread, then purge current attrs 472 * and bail out to avoid potential deadlock between another 473 * thread caching attrs (r_serial thread), recov thread, 474 * and an async writer thread. 475 */ 476 if (recov) { 477 PURGE_ATTRCACHE4_LOCKED(rp); 478 mutex_exit(&rp->r_statelock); 479 return; 480 } 481 482 if (lwp != NULL) 483 lwp->lwp_nostop++; 484 while (rp->r_serial != NULL) { 485 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 486 mutex_exit(&rp->r_statelock); 487 if (lwp != NULL) 488 lwp->lwp_nostop--; 489 return; 490 } 491 } 492 if (lwp != NULL) 493 lwp->lwp_nostop--; 494 } 495 496 /* 497 * If there is a page flush thread, the current thread needs to 498 * bail out, to prevent a possible deadlock between the current 499 * thread (which might be in a start_op/end_op region), the 500 * recovery thread, and the page flush thread. Expire the 501 * attribute cache, so that any attributes the current thread was 502 * going to set are not lost. 503 */ 504 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) { 505 PURGE_ATTRCACHE4_LOCKED(rp); 506 mutex_exit(&rp->r_statelock); 507 return; 508 } 509 510 if (rp->r_time_attr_saved > t) { 511 /* 512 * Attributes have been cached since these attributes were 513 * probably made. If there is an inconsistency in what is 514 * cached, mark them invalid. If not, don't act on them. 515 */ 516 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 517 PURGE_ATTRCACHE4_LOCKED(rp); 518 mutex_exit(&rp->r_statelock); 519 return; 520 } 521 set_time_cache_inval = 0; 522 if (cinfo) { 523 /* 524 * Only directory modifying callers pass non-NULL cinfo. 
525 */ 526 ASSERT(vp->v_type == VDIR); 527 /* 528 * If the cache timeout either doesn't exist or hasn't expired, 529 * and dir didn't changed on server before dirmod op 530 * and dir didn't change after dirmod op but before getattr 531 * then there's a chance that the client's cached data for 532 * this object is current (not stale). No immediate cache 533 * flush is required. 534 * 535 */ 536 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) && 537 cinfo->before == rp->r_change && 538 (garp->n4g_change_valid && 539 cinfo->after == garp->n4g_change)) { 540 541 /* 542 * If atomic isn't set, then the before/after info 543 * cannot be blindly trusted. For this case, we tell 544 * nfs4_attrcache_va to cache the attrs but also 545 * establish an absolute maximum cache timeout. When 546 * the timeout is reached, caches will be flushed. 547 */ 548 if (! cinfo->atomic) 549 set_time_cache_inval = 1; 550 } else { 551 552 /* 553 * We're not sure exactly what changed, but we know 554 * what to do. flush all caches for dir. remove the 555 * attr timeout. 556 * 557 * a) timeout expired. flush all caches. 558 * b) r_change != cinfo.before. flush all caches. 559 * c) r_change == cinfo.before, but cinfo.after != 560 * post-op getattr(change). flush all caches. 561 * d) post-op getattr(change) not provided by server. 562 * flush all caches. 563 */ 564 mtime_changed = 1; 565 ctime_changed = 1; 566 rp->r_time_cache_inval = 0; 567 } 568 } else { 569 /* 570 * Write thread after writing data to file on remote server, 571 * will always set R4WRITEMODIFIED to indicate that file on 572 * remote server was modified with a WRITE operation and would 573 * have marked attribute cache as timed out. If R4WRITEMODIFIED 574 * is set, then do not check for mtime and ctime change. 
575 */ 576 if (!(rp->r_flags & R4WRITEMODIFIED)) { 577 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 578 mtime_changed = 1; 579 580 if (rp->r_attr.va_ctime.tv_sec != 581 vap->va_ctime.tv_sec || 582 rp->r_attr.va_ctime.tv_nsec != 583 vap->va_ctime.tv_nsec) 584 ctime_changed = 1; 585 } else { 586 writemodify_set = B_TRUE; 587 } 588 } 589 590 preattr_rsize = rp->r_size; 591 592 nfs4_attrcache_va(vp, garp, set_time_cache_inval); 593 594 /* 595 * If we have updated filesize in nfs4_attrcache_va, as soon as we 596 * drop statelock we will be in transition of purging all 597 * our caches and updating them. It is possible for another 598 * thread to pick this new file size and read in zeroed data. 599 * stall other threads till cache purge is complete. 600 */ 601 if ((!cinfo) && (rp->r_size != preattr_rsize)) { 602 /* 603 * If R4WRITEMODIFIED was set and we have updated the file 604 * size, Server's returned file size need not necessarily 605 * be because of this Client's WRITE. We need to purge 606 * all caches. 607 */ 608 if (writemodify_set) 609 mtime_changed = 1; 610 611 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) { 612 rp->r_flags |= R4INCACHEPURGE; 613 cachepurge_set = B_TRUE; 614 } 615 } 616 617 if (!mtime_changed && !ctime_changed) { 618 mutex_exit(&rp->r_statelock); 619 return; 620 } 621 622 rp->r_serial = curthread; 623 624 mutex_exit(&rp->r_statelock); 625 626 /* 627 * If we're the recov thread, then force async nfs4_purge_caches 628 * to avoid potential deadlock. 629 */ 630 if (mtime_changed) 631 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 
1 : async); 632 633 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) { 634 mutex_enter(&rp->r_statelock); 635 rp->r_flags &= ~R4INCACHEPURGE; 636 cv_broadcast(&rp->r_cv); 637 mutex_exit(&rp->r_statelock); 638 cachepurge_set = B_FALSE; 639 } 640 641 if (ctime_changed) { 642 (void) nfs4_access_purge_rp(rp); 643 if (rp->r_secattr != NULL) { 644 mutex_enter(&rp->r_statelock); 645 vsp = rp->r_secattr; 646 rp->r_secattr = NULL; 647 mutex_exit(&rp->r_statelock); 648 if (vsp != NULL) 649 nfs4_acl_free_cache(vsp); 650 } 651 } 652 653 if (!was_serial) { 654 mutex_enter(&rp->r_statelock); 655 rp->r_serial = NULL; 656 cv_broadcast(&rp->r_cv); 657 mutex_exit(&rp->r_statelock); 658 } 659 } 660 661 /* 662 * Set attributes cache for given vnode using virtual attributes. 663 * 664 * Set the timeout value on the attribute cache and fill it 665 * with the passed in attributes. 666 * 667 * The caller must be holding r_statelock. 668 */ 669 static void 670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout) 671 { 672 rnode4_t *rp; 673 mntinfo4_t *mi; 674 hrtime_t delta; 675 hrtime_t now; 676 vattr_t *vap = &garp->n4g_va; 677 678 rp = VTOR4(vp); 679 680 ASSERT(MUTEX_HELD(&rp->r_statelock)); 681 ASSERT(vap->va_mask == AT_ALL); 682 683 /* Switch to master before checking v_flag */ 684 if (IS_SHADOW(vp, rp)) 685 vp = RTOV4(rp); 686 687 now = gethrtime(); 688 689 mi = VTOMI4(vp); 690 691 /* 692 * Only establish a new cache timeout (if requested). Never 693 * extend a timeout. Never clear a timeout. Clearing a timeout 694 * is done by nfs4_update_dircaches (ancestor in our call chain) 695 */ 696 if (set_cache_timeout && ! rp->r_time_cache_inval) 697 rp->r_time_cache_inval = now + mi->mi_acdirmax; 698 699 /* 700 * Delta is the number of nanoseconds that we will 701 * cache the attributes of the file. It is based on 702 * the number of nanoseconds since the last time that 703 * we detected a change. 
The assumption is that files 704 * that changed recently are likely to change again. 705 * There is a minimum and a maximum for regular files 706 * and for directories which is enforced though. 707 * 708 * Using the time since last change was detected 709 * eliminates direct comparison or calculation 710 * using mixed client and server times. NFS does 711 * not make any assumptions regarding the client 712 * and server clocks being synchronized. 713 */ 714 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 715 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 716 vap->va_size != rp->r_attr.va_size) { 717 rp->r_time_attr_saved = now; 718 } 719 720 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE)) 721 delta = 0; 722 else { 723 delta = now - rp->r_time_attr_saved; 724 if (vp->v_type == VDIR) { 725 if (delta < mi->mi_acdirmin) 726 delta = mi->mi_acdirmin; 727 else if (delta > mi->mi_acdirmax) 728 delta = mi->mi_acdirmax; 729 } else { 730 if (delta < mi->mi_acregmin) 731 delta = mi->mi_acregmin; 732 else if (delta > mi->mi_acregmax) 733 delta = mi->mi_acregmax; 734 } 735 } 736 rp->r_time_attr_inval = now + delta; 737 738 rp->r_attr = *vap; 739 if (garp->n4g_change_valid) 740 rp->r_change = garp->n4g_change; 741 742 /* 743 * The attributes that were returned may be valid and can 744 * be used, but they may not be allowed to be cached. 745 * Reset the timers to cause immediate invalidation and 746 * clear r_change so no VERIFY operations will suceed 747 */ 748 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) { 749 rp->r_time_attr_inval = now; 750 rp->r_time_attr_saved = now; 751 rp->r_change = 0; 752 } 753 754 /* 755 * If mounted_on_fileid returned AND the object is a stub, 756 * then set object's va_nodeid to the mounted over fid 757 * returned by server. 758 * 759 * If mounted_on_fileid not provided/supported, then 760 * just set it to 0 for now. Eventually it would be 761 * better to set it to a hashed version of FH. 
This 762 * would probably be good enough to provide a unique 763 * fid/d_ino within a dir. 764 * 765 * We don't need to carry mounted_on_fileid in the 766 * rnode as long as the client never requests fileid 767 * without also requesting mounted_on_fileid. For 768 * now, it stays. 769 */ 770 if (garp->n4g_mon_fid_valid) { 771 rp->r_mntd_fid = garp->n4g_mon_fid; 772 773 if (RP_ISSTUB(rp)) 774 rp->r_attr.va_nodeid = rp->r_mntd_fid; 775 } 776 777 /* 778 * Check to see if there are valid pathconf bits to 779 * cache in the rnode. 780 */ 781 if (garp->n4g_ext_res) { 782 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) { 783 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4; 784 } else { 785 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) { 786 rp->r_pathconf.pc4_xattr_valid = TRUE; 787 rp->r_pathconf.pc4_xattr_exists = 788 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists; 789 } 790 } 791 } 792 /* 793 * Update the size of the file if there is no cached data or if 794 * the cached data is clean and there is no data being written 795 * out. 796 */ 797 if (rp->r_size != vap->va_size && 798 (!vn_has_cached_data(vp) || 799 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) { 800 rp->r_size = vap->va_size; 801 } 802 nfs_setswaplike(vp, vap); 803 rp->r_flags &= ~R4WRITEMODIFIED; 804 } 805 806 /* 807 * Get attributes over-the-wire and update attributes cache 808 * if no error occurred in the over-the-wire operation. 809 * Return 0 if successful, otherwise error. 
810 */ 811 int 812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 813 { 814 mntinfo4_t *mi = VTOMI4(vp); 815 hrtime_t t; 816 nfs4_recov_state_t recov_state; 817 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 818 819 recov_state.rs_flags = 0; 820 recov_state.rs_num_retry_despite_err = 0; 821 822 /* Save the original mount point security flavor */ 823 (void) save_mnt_secinfo(mi->mi_curr_serv); 824 825 recov_retry: 826 827 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 828 &recov_state, NULL))) { 829 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 830 return (e.error); 831 } 832 833 t = gethrtime(); 834 835 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 836 837 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 838 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 839 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) { 840 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 841 &recov_state, 1); 842 goto recov_retry; 843 } 844 } 845 846 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 847 848 if (!e.error) { 849 if (e.stat == NFS4_OK) { 850 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 851 } else { 852 e.error = geterrno4(e.stat); 853 854 nfs4_purge_stale_fh(e.error, vp, cr); 855 } 856 } 857 858 /* 859 * If getattr a node that is a stub for a crossed 860 * mount point, keep the original secinfo flavor for 861 * the current file system, not the crossed one. 862 */ 863 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 864 865 return (e.error); 866 } 867 868 /* 869 * Generate a compound to get attributes over-the-wire. 
870 */ 871 void 872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 873 nfs4_error_t *ep, cred_t *cr, int get_acl) 874 { 875 COMPOUND4args_clnt args; 876 COMPOUND4res_clnt res; 877 int doqueue; 878 rnode4_t *rp = VTOR4(vp); 879 nfs_argop4 argop[2]; 880 881 args.ctag = TAG_GETATTR; 882 883 args.array_len = 2; 884 args.array = argop; 885 886 /* putfh */ 887 argop[0].argop = OP_CPUTFH; 888 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 889 890 /* getattr */ 891 /* 892 * Unlike nfs version 2 and 3, where getattr returns all the 893 * attributes, nfs version 4 returns only the ones explicitly 894 * asked for. This creates problems, as some system functions 895 * (e.g. cache check) require certain attributes and if the 896 * cached node lacks some attributes such as uid/gid, it can 897 * affect system utilities (e.g. "ls") that rely on the information 898 * to be there. This can lead to anything from system crashes to 899 * corrupted information processed by user apps. 900 * So to ensure that all bases are covered, request at least 901 * the AT_ALL attribute mask. 902 */ 903 argop[1].argop = OP_GETATTR; 904 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 905 if (get_acl) 906 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 907 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 908 909 doqueue = 1; 910 911 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 912 913 if (ep->error) 914 return; 915 916 if (res.status != NFS4_OK) { 917 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 918 return; 919 } 920 921 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 922 923 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 924 } 925 926 /* 927 * Return either cached or remote attributes. If get remote attr 928 * use them to check and invalidate caches, then cache the new attributes. 
929 */ 930 int 931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 932 { 933 int error; 934 rnode4_t *rp; 935 nfs4_ga_res_t gar; 936 937 ASSERT(nfs4_consistent_type(vp)); 938 939 /* 940 * If we've got cached attributes, we're done, otherwise go 941 * to the server to get attributes, which will update the cache 942 * in the process. Either way, use the cached attributes for 943 * the caller's vattr_t. 944 * 945 * Note that we ignore the gar set by the OTW call: the attr caching 946 * code may make adjustments when storing to the rnode, and we want 947 * to see those changes here. 948 */ 949 rp = VTOR4(vp); 950 error = 0; 951 mutex_enter(&rp->r_statelock); 952 if (!ATTRCACHE4_VALID(vp)) { 953 mutex_exit(&rp->r_statelock); 954 error = nfs4_getattr_otw(vp, &gar, cr, 0); 955 mutex_enter(&rp->r_statelock); 956 } 957 958 if (!error) 959 *vap = rp->r_attr; 960 961 /* Return the client's view of file size */ 962 vap->va_size = rp->r_size; 963 964 mutex_exit(&rp->r_statelock); 965 966 ASSERT(nfs4_consistent_type(vp)); 967 968 return (error); 969 } 970 971 int 972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 973 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 974 { 975 COMPOUND4args_clnt args; 976 COMPOUND4res_clnt res; 977 int doqueue; 978 nfs_argop4 argop[2]; 979 mntinfo4_t *mi = VTOMI4(vp); 980 bool_t needrecov = FALSE; 981 nfs4_recov_state_t recov_state; 982 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 983 nfs4_ga_ext_res_t *gerp; 984 985 recov_state.rs_flags = 0; 986 recov_state.rs_num_retry_despite_err = 0; 987 988 recov_retry: 989 args.ctag = tag_type; 990 991 args.array_len = 2; 992 args.array = argop; 993 994 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 995 if (e.error) 996 return (e.error); 997 998 /* putfh */ 999 argop[0].argop = OP_CPUTFH; 1000 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 1001 1002 /* getattr */ 1003 argop[1].argop = OP_GETATTR; 1004 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 1005 
argop[1].nfs_argop4_u.opgetattr.mi = mi; 1006 1007 doqueue = 1; 1008 1009 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1010 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 1011 rnode4info(VTOR4(vp)))); 1012 1013 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1014 1015 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1016 if (!needrecov && e.error) { 1017 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1018 needrecov); 1019 return (e.error); 1020 } 1021 1022 if (needrecov) { 1023 bool_t abort; 1024 1025 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1026 "nfs4_attr_otw: initiating recovery\n")); 1027 1028 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1029 NULL, OP_GETATTR, NULL, NULL, NULL); 1030 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1031 needrecov); 1032 if (!e.error) { 1033 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1034 e.error = geterrno4(res.status); 1035 } 1036 if (abort == FALSE) 1037 goto recov_retry; 1038 return (e.error); 1039 } 1040 1041 if (res.status) { 1042 e.error = geterrno4(res.status); 1043 } else { 1044 gerp = garp->n4g_ext_res; 1045 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1046 garp, sizeof (nfs4_ga_res_t)); 1047 garp->n4g_ext_res = gerp; 1048 if (garp->n4g_ext_res && 1049 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1050 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1051 ga_res.n4g_ext_res, 1052 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1053 } 1054 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1055 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1056 needrecov); 1057 return (e.error); 1058 } 1059 1060 /* 1061 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1062 * for the demand-based allocation of async threads per-mount. The 1063 * nfs_async_timeout is the amount of time a thread will live after it 1064 * becomes idle, unless new I/O requests are received before the thread 1065 * dies. 
See nfs4_async_putpage and nfs4_async_start. 1066 */ 1067 1068 static void nfs4_async_start(struct vfs *); 1069 static void nfs4_async_pgops_start(struct vfs *); 1070 static void nfs4_async_common_start(struct vfs *, int); 1071 1072 static void 1073 free_async_args4(struct nfs4_async_reqs *args) 1074 { 1075 rnode4_t *rp; 1076 1077 if (args->a_io != NFS4_INACTIVE) { 1078 rp = VTOR4(args->a_vp); 1079 mutex_enter(&rp->r_statelock); 1080 rp->r_count--; 1081 if (args->a_io == NFS4_PUTAPAGE || 1082 args->a_io == NFS4_PAGEIO) 1083 rp->r_awcount--; 1084 cv_broadcast(&rp->r_cv); 1085 mutex_exit(&rp->r_statelock); 1086 VN_RELE(args->a_vp); 1087 } 1088 crfree(args->a_cred); 1089 kmem_free(args, sizeof (*args)); 1090 } 1091 1092 /* 1093 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1094 * pageout(), running in the global zone, have legitimate reasons to do 1095 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1096 * use of a a per-mount "asynchronous requests manager thread" which is 1097 * signaled by the various asynchronous work routines when there is 1098 * asynchronous work to be done. It is responsible for creating new 1099 * worker threads if necessary, and notifying existing worker threads 1100 * that there is work to be done. 1101 * 1102 * In other words, it will "take the specifications from the customers and 1103 * give them to the engineers." 1104 * 1105 * Worker threads die off of their own accord if they are no longer 1106 * needed. 1107 * 1108 * This thread is killed when the zone is going away or the filesystem 1109 * is being unmounted. 
*/
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	for (;;) {
		/* One pass of this inner loop per pending request count. */
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
			    MAX(mi->mi_max_threads, max_threads)) {
				/*
				 * Count the new worker before dropping
				 * mi_async_lock around the hold/create calls.
				 */
				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
			    NUM_ASYNC_PGOPS_THREADS) {
				/* Same dance for the page-ops worker pool. */
				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0,
				    nfs4_async_pgops_start, vfsp, 0,
				    minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			/* Kick a worker and account for the request. */
			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}

		/* mi_flags is examined under mi_lock. */
		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
			mutex_exit(&mi->mi_lock);
			break;
		}
		mutex_exit(&mi->mi_lock);

		/* Sleep until a requester or the stop routine signals us. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
	}

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	/* The stop flag lives in mi_flags, which is updated under mi_lock. */
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	/* The manager sleeps on mi_async_reqs_cv; wake it to see the flag. */
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_readahead - queue an asynchronous read-ahead request.
 *
 * Returns 0 if the request was queued for a worker thread.  Returns -1
 * (and queues nothing) if addr lies at or beyond the end of seg, if no
 * request structure could be allocated, if r_lkserlock could not be taken
 * as a reader without blocking, or if async i/o is disabled
 * (mi_max_threads == 0).
 */
int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	/* Fill in the request; it carries its own vnode and cred holds. */
	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* Tell the manager thread there is one more request to hand out. */
	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/* Undo the r_count bump and the holds taken for the request. */
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * Thread entry point for workers servicing NFS4_ASYNC_QUEUE; handed to
 * zthread_create() by nfs4_async_manager().
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
}

/*
 * Thread entry point for workers servicing NFS4_ASYNC_PGOPS_QUEUE; handed
 * to zthread_create() by nfs4_async_manager().
 */
static void
nfs4_async_pgops_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 *
 * vfsp is the mount being serviced; async_queue selects which worker pool
 * this thread belongs to (NFS4_ASYNC_QUEUE or NFS4_ASYNC_PGOPS_QUEUE) and
 * therefore how many request types it scans and which work cv it sleeps on.
 * The function does not return; the thread leaves via zthread_exit().
 */
static void
nfs4_async_common_start(struct vfs *vfsp, int async_queue)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;
	int async_types;
	kcondvar_t *async_work_cv;

	if (async_queue == NFS4_ASYNC_QUEUE) {
		async_types = NFS4_ASYNC_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
	} else {
		async_types = NFS4_ASYNC_PGOPS_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
	}

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < async_types; i++) {
			args = *mi->mi_async_curr[async_queue];
			if (args != NULL)
				break;
			mi->mi_async_curr[async_queue]++;
			/* Wrap back to the first queue past the last type. */
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				--mi->mi_threads[async_queue];

				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			/*
			 * The result is tested by the exit check above on
			 * the next idle pass.
			 */
			time_left = cv_reltimedwait(async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			/* Found work: reset so the timeout check cannot fire. */
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr[async_queue] = args->a_next;
		if (*mi->mi_async_curr[async_queue] == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr[async_queue]++;
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}

		/*
		 * NFS4_INACTIVE requests are excluded here, matching
		 * nfs4_async_inactive(), which does no kstat_waitq_enter().
		 */
		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* The request is serviced without mi_async_lock held. */
		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 * A read-ahead is skipped (but still freed below) when
		 * async i/o has been disabled in the meantime.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 *
 * Services the NFS4_INACTIVE request list of the given mount until the
 * list is empty and the async manager thread has gone away, then clears
 * mi_inactive_thread and exits.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			/*
			 * Advertise that we are idle; the stop routines wait
			 * on mi_async_cv for this flag.
			 */
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			/* Unlink the head request, then work on it unlocked. */
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	/* Reached with both mi_lock and mi_async_lock held. */
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	/* Zeroing mi_max_threads makes idle workers exit when woken. */
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			/* mi_lock is dropped across the wait. */
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete. If a signal is delivered we will abort and return non-zero;
 * otherwise return 0. Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
*/
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	/* Remember the old limit so it can be restored if interrupted. */
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			/*
			 * mi_lock is dropped before waiting, so the
			 * interrupted path holds only mi_async_lock.
			 */
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	/* On interruption, re-enable async i/o with the saved limit. */
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

/*
 * nfs4_async_putapage - queue an asynchronous putapage request.
 *
 * If the request is queued, it carries its own vnode and cred holds,
 * r_count and r_awcount are bumped, and 0 is returned.  If no request
 * structure can be allocated or async i/o is disabled
 * (mi_max_threads == 0), the noasync path runs instead:
 *   - for pageout/fsflush the pages are handed to pvn_write_done() with
 *     B_ERROR and 0 is returned;
 *   - for a caller in a zone other than mi_zone the pages are handed to
 *     pvn_write_done() with B_ERROR and EPERM is returned;
 *   - otherwise the putapage callback is invoked directly and its return
 *     value is returned.
 */
int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		/* Drop the holds taken for the request we will not queue. */
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	/* Undone by free_async_args4() once the request is serviced. */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine. In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage.
		 *
		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
		 * as dirty and unlock them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

/*
 * nfs4_async_pageio - queue an asynchronous pageio request.
 *
 * If the request is queued, it carries its own vnode and cred holds,
 * r_count and r_awcount are bumped, and 0 is returned.  If no request
 * structure can be allocated or async i/o is disabled
 * (mi_max_threads == 0), the noasync path runs instead:
 *   - for B_READ requests the pages are handed to pvn_read_done() with
 *     B_ERROR and 0 is returned;
 *   - for pageout/fsflush the pages are handed to pvn_write_done() with
 *     B_ERROR and 0 is returned;
 *   - for a caller in a zone other than mi_zone the pages are handed to
 *     pvn_write_done() with B_ERROR and EPERM is returned;
 *   - otherwise the pageio callback is invoked directly and its return
 *     value is returned.
 */
int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		/* Drop the holds taken for the request we will not queue. */
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	/* Undone by free_async_args4() once the request is serviced. */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

/*
 * nfs4_async_readdir - queue an asynchronous readdir request for the
 * readdir cache entry rdc.
 *
 * If the request is queued, it carries its own vnode and cred holds and
 * r_count is bumped.  If no request structure can be allocated or async
 * i/o is disabled (mi_max_threads == 0), the readdir is not performed at
 * all: under r_statelock the entry's 'entries' pointer is cleared, RDDIR
 * is cleared and RDDIRREQ is set in its flags, and rddir4_cache_rele()
 * is called on it.
 */
void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		/* Drop the holds taken for the request we will not queue. */
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	/* Undone by free_async_args4() once the request is serviced. */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

/*
 * nfs4_async_commit - queue an asynchronous commit request for the pages
 * on plist.
 *
 * If the request is queued, it carries its own vnode and cred holds and
 * r_count is bumped.  If no request structure can be allocated or async
 * i/o is disabled (mi_max_threads == 0), then for pageout/fsflush or a
 * caller in a zone other than mi_zone each page is removed from plist,
 * has p_fsdata set to C_COMMIT and is unlocked; otherwise the commit
 * callback is invoked directly in this thread.
 */
void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_COMMIT;
	args->a_nfs4_commit = commit;
	args->a_nfs4_plist = plist;
	args->a_nfs4_offset = offset;
	args->a_nfs4_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		/* Drop the holds taken for the request we will not queue. */
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS4_COMMIT] = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	}

	/* Undone by free_async_args4() once the request is serviced. */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		/*
		 * No commit is issued here; each page is tagged with
		 * C_COMMIT in p_fsdata and unlocked instead.
		 */
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

/*
 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
 * reference to the vnode is handed over to the thread; the caller should
 * no longer refer to the vnode.
 *
 * Unlike most of the async routines, this handoff is needed for
 * correctness reasons, not just performance.  So doing operations in the
 * context of the current thread is not an option.
2142 */ 2143 void 2144 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2145 { 2146 mntinfo4_t *mi; 2147 struct nfs4_async_reqs *args; 2148 boolean_t signal_inactive_thread = B_FALSE; 2149 2150 mi = VTOMI4(vp); 2151 2152 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2153 args->a_next = NULL; 2154 #ifdef DEBUG 2155 args->a_queuer = curthread; 2156 #endif 2157 args->a_vp = vp; 2158 ASSERT(cr != NULL); 2159 crhold(cr); 2160 args->a_cred = cr; 2161 args->a_io = NFS4_INACTIVE; 2162 2163 /* 2164 * Note that we don't check mi->mi_max_threads here, since we 2165 * *need* to get rid of this vnode regardless of whether someone 2166 * set nfs4_max_threads to zero in /etc/system. 2167 * 2168 * The manager thread knows about this and is willing to create 2169 * at least one thread to accommodate us. 2170 */ 2171 mutex_enter(&mi->mi_async_lock); 2172 if (mi->mi_inactive_thread == NULL) { 2173 rnode4_t *rp; 2174 vnode_t *unldvp = NULL; 2175 char *unlname; 2176 cred_t *unlcred; 2177 2178 mutex_exit(&mi->mi_async_lock); 2179 /* 2180 * We just need to free up the memory associated with the 2181 * vnode, which can be safely done from within the current 2182 * context. 2183 */ 2184 crfree(cr); /* drop our reference */ 2185 kmem_free(args, sizeof (*args)); 2186 rp = VTOR4(vp); 2187 mutex_enter(&rp->r_statelock); 2188 if (rp->r_unldvp != NULL) { 2189 unldvp = rp->r_unldvp; 2190 rp->r_unldvp = NULL; 2191 unlname = rp->r_unlname; 2192 rp->r_unlname = NULL; 2193 unlcred = rp->r_unlcred; 2194 rp->r_unlcred = NULL; 2195 } 2196 mutex_exit(&rp->r_statelock); 2197 /* 2198 * No need to explicitly throw away any cached pages. The 2199 * eventual r4inactive() will attempt a synchronous 2200 * VOP_PUTPAGE() which will immediately fail since the request 2201 * is coming from the wrong zone, and then will proceed to call 2202 * nfs4_invalidate_pages() which will clean things up for us. 
/*
 * writerp4 - copy up to tcount bytes described by uio into the file's
 * cached pages, one page-sized piece at a time.
 *
 * Arguments:
 *	rp        - rnode being written; r_rwlock must be held as writer.
 *	base      - kernel address of the mapping to copy into; used only
 *		    when vpm_enable is zero.
 *	tcount    - number of bytes to move; at most MAXBSIZE and at most
 *		    uio->uio_resid.
 *	uio       - source of the data and of the file offset (uio_loffset).
 *	pgcreated - nonzero when the caller has already created the first
 *		    page; that page is then left for the caller to unlock.
 *
 * For every piece the routine sets R4MODINPROGRESS (and r_modaddr) while
 * the copy is in flight, then grows r_size to uio_loffset if needed,
 * clears R4MODINPROGRESS and sets R4DIRTY, all under r_statelock.
 *
 * Returns 0 on success, otherwise the first error from the copy routine
 * (uiomove() or vpm_data_copy()) or, if the copy succeeded, the error
 * from the segmap_fault() soft-unlock.
 */
int
writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;
	vnode_t *vp = RTOV(rp);

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
	if (!vpm_enable) {
		/* The whole transfer must fit inside one MAXBSIZE mapping. */
		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	}

	/*
	 * Move bytes in at most PAGESIZE chunks.  We must avoid
	 * spanning pages in uiomove() because page faults may cause
	 * the cache to be invalidated out from under us.  The r_size is not
	 * updated until after the uiomove.  If we push the last page of a
	 * file before r_size is correct, we will lose the data written past
	 * the current (and invalid) r_size.
	 */
	do {
		offset = uio->uio_loffset;
		pagecreate = 0;

		/*
		 * n is the number of bytes required to satisfy the request
		 * or the number of bytes to fill out the page.
		 */
		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE.  With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    ((offset & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);

		if (!vpm_enable && pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked.  This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases.  An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			/*
			 * Remember what was locked; base and n are both
			 * modified before the soft-unlock below.
			 */
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file is being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4MODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * Copy data.  If new pages are created, part of
			 * the page that is not written will be initialized
			 * with zeros.
			 */
			error = vpm_data_copy(vp, offset, n, uio,
			    !pagecreate, NULL, 0, S_WRITE);
		} else {
			error = uiomove(base, n, UIO_WRITE, uio);
		}

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~R4MODINPROGRESS;
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);

		if (!vpm_enable) {
			base += n;
		}

		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (!vpm_enable && pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page.  Unlock it.  This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				/* Do not let the unlock status hide a copy error. */
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}
2386 */ 2387 if (!vpm_enable && pagecreate) { 2388 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2389 (void) kzero(base, PAGESIZE - n); 2390 2391 if (pgcreated) { 2392 /* 2393 * Caller is responsible for this page, 2394 * it was not created in this loop. 2395 */ 2396 pgcreated = 0; 2397 } else { 2398 /* 2399 * For bug 1094402: segmap_pagecreate locks 2400 * page. Unlock it. This also unlocks the 2401 * pages allocated by page_create_va() in 2402 * segmap_pagecreate(). 2403 */ 2404 sm_error = segmap_fault(kas.a_hat, segkmap, 2405 saved_base, saved_n, 2406 F_SOFTUNLOCK, S_WRITE); 2407 if (error == 0) 2408 error = sm_error; 2409 } 2410 } 2411 } while (tcount > 0 && error == 0); 2412 2413 return (error); 2414 } 2415 2416 int 2417 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2418 { 2419 rnode4_t *rp; 2420 page_t *pp; 2421 u_offset_t eoff; 2422 u_offset_t io_off; 2423 size_t io_len; 2424 int error; 2425 int rdirty; 2426 int err; 2427 2428 rp = VTOR4(vp); 2429 ASSERT(rp->r_count > 0); 2430 2431 if (!nfs4_has_pages(vp)) 2432 return (0); 2433 2434 ASSERT(vp->v_type != VCHR); 2435 2436 /* 2437 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2438 * writes. B_FORCE is set to force the VM system to actually 2439 * invalidate the pages, even if the i/o failed. The pages 2440 * need to get invalidated because they can't be written out 2441 * because there isn't any space left on either the server's 2442 * file system or in the user's disk quota. The B_FREE bit 2443 * is cleared to avoid confusion as to whether this is a 2444 * request to place the page on the freelist or to destroy 2445 * it. 2446 */ 2447 if ((rp->r_flags & R4OUTOFSPACE) || 2448 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2449 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2450 2451 if (len == 0) { 2452 /* 2453 * If doing a full file synchronous operation, then clear 2454 * the R4DIRTY bit. 
If a page gets dirtied while the flush 2455 * is happening, then R4DIRTY will get set again. The 2456 * R4DIRTY bit must get cleared before the flush so that 2457 * we don't lose this information. 2458 * 2459 * If there are no full file async write operations 2460 * pending and RDIRTY bit is set, clear it. 2461 */ 2462 if (off == (u_offset_t)0 && 2463 !(flags & B_ASYNC) && 2464 (rp->r_flags & R4DIRTY)) { 2465 mutex_enter(&rp->r_statelock); 2466 rdirty = (rp->r_flags & R4DIRTY); 2467 rp->r_flags &= ~R4DIRTY; 2468 mutex_exit(&rp->r_statelock); 2469 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2470 mutex_enter(&rp->r_statelock); 2471 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2472 rdirty = (rp->r_flags & R4DIRTY); 2473 rp->r_flags &= ~R4DIRTY; 2474 } 2475 mutex_exit(&rp->r_statelock); 2476 } else 2477 rdirty = 0; 2478 2479 /* 2480 * Search the entire vp list for pages >= off, and flush 2481 * the dirty pages. 2482 */ 2483 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2484 flags, cr); 2485 2486 /* 2487 * If an error occurred and the file was marked as dirty 2488 * before and we aren't forcibly invalidating pages, then 2489 * reset the R4DIRTY flag. 2490 */ 2491 if (error && rdirty && 2492 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2493 mutex_enter(&rp->r_statelock); 2494 rp->r_flags |= R4DIRTY; 2495 mutex_exit(&rp->r_statelock); 2496 } 2497 } else { 2498 /* 2499 * Do a range from [off...off + len) looking for pages 2500 * to deal with. 2501 */ 2502 error = 0; 2503 io_len = 0; 2504 eoff = off + len; 2505 mutex_enter(&rp->r_statelock); 2506 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2507 io_off += io_len) { 2508 mutex_exit(&rp->r_statelock); 2509 /* 2510 * If we are not invalidating, synchronously 2511 * freeing or writing pages use the routine 2512 * page_lookup_nowait() to prevent reclaiming 2513 * them from the free list. 
2514 */ 2515 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2516 pp = page_lookup(vp, io_off, 2517 (flags & (B_INVAL | B_FREE)) ? 2518 SE_EXCL : SE_SHARED); 2519 } else { 2520 pp = page_lookup_nowait(vp, io_off, 2521 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2522 } 2523 2524 if (pp == NULL || !pvn_getdirty(pp, flags)) 2525 io_len = PAGESIZE; 2526 else { 2527 err = (*rp->r_putapage)(vp, pp, &io_off, 2528 &io_len, flags, cr); 2529 if (!error) 2530 error = err; 2531 /* 2532 * "io_off" and "io_len" are returned as 2533 * the range of pages we actually wrote. 2534 * This allows us to skip ahead more quickly 2535 * since several pages may've been dealt 2536 * with by this iteration of the loop. 2537 */ 2538 } 2539 mutex_enter(&rp->r_statelock); 2540 } 2541 mutex_exit(&rp->r_statelock); 2542 } 2543 2544 return (error); 2545 } 2546 2547 void 2548 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2549 { 2550 rnode4_t *rp; 2551 2552 rp = VTOR4(vp); 2553 if (IS_SHADOW(vp, rp)) 2554 vp = RTOV4(rp); 2555 mutex_enter(&rp->r_statelock); 2556 while (rp->r_flags & R4TRUNCATE) 2557 cv_wait(&rp->r_cv, &rp->r_statelock); 2558 rp->r_flags |= R4TRUNCATE; 2559 if (off == (u_offset_t)0) { 2560 rp->r_flags &= ~R4DIRTY; 2561 if (!(rp->r_flags & R4STALE)) 2562 rp->r_error = 0; 2563 } 2564 rp->r_truncaddr = off; 2565 mutex_exit(&rp->r_statelock); 2566 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2567 B_INVAL | B_TRUNC, cr); 2568 mutex_enter(&rp->r_statelock); 2569 rp->r_flags &= ~R4TRUNCATE; 2570 cv_broadcast(&rp->r_cv); 2571 mutex_exit(&rp->r_statelock); 2572 } 2573 2574 static int 2575 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2576 { 2577 mntinfo4_t *mi; 2578 struct mntinfo_kstat *mik; 2579 vfs_t *vfsp; 2580 2581 /* this is a read-only kstat. Bail out on a write */ 2582 if (rw == KSTAT_WRITE) 2583 return (EACCES); 2584 2585 2586 /* 2587 * We don't want to wait here as kstat_chain_lock could be held by 2588 * dounmount(). 
dounmount() takes vfs_reflock before the chain lock 2589 * and thus could lead to a deadlock. 2590 */ 2591 vfsp = (struct vfs *)ksp->ks_private; 2592 2593 mi = VFTOMI4(vfsp); 2594 mik = (struct mntinfo_kstat *)ksp->ks_data; 2595 2596 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2597 2598 mik->mik_vers = (uint32_t)mi->mi_vers; 2599 mik->mik_flags = mi->mi_flags; 2600 /* 2601 * The sv_secdata holds the flavor the client specifies. 2602 * If the client uses default and a security negotiation 2603 * occurs, sv_currsec will point to the current flavor 2604 * selected from the server flavor list. 2605 * sv_currsec is NULL if no security negotiation takes place. 2606 */ 2607 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2608 mi->mi_curr_serv->sv_currsec->secmod : 2609 mi->mi_curr_serv->sv_secdata->secmod; 2610 mik->mik_curread = (uint32_t)mi->mi_curread; 2611 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2612 mik->mik_retrans = mi->mi_retrans; 2613 mik->mik_timeo = mi->mi_timeo; 2614 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2615 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2616 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2617 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2618 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2619 mik->mik_failover = (uint32_t)mi->mi_failover; 2620 mik->mik_remap = (uint32_t)mi->mi_remap; 2621 2622 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2623 2624 return (0); 2625 } 2626 2627 void 2628 nfs4_mnt_kstat_init(struct vfs *vfsp) 2629 { 2630 mntinfo4_t *mi = VFTOMI4(vfsp); 2631 2632 /* 2633 * PSARC 2001/697 Contract Private Interface 2634 * All nfs kstats are under SunMC contract 2635 * Please refer to the PSARC listed above and contact 2636 * SunMC before making any changes! 
2637 * 2638 * Changes must be reviewed by Solaris File Sharing 2639 * Changes must be communicated to contract-2001-697@sun.com 2640 * 2641 */ 2642 2643 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2644 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2645 if (mi->mi_io_kstats) { 2646 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2647 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2648 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2649 kstat_install(mi->mi_io_kstats); 2650 } 2651 2652 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2653 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2654 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2655 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2656 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2657 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2658 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2659 kstat_install(mi->mi_ro_kstats); 2660 } 2661 2662 nfs4_mnt_recov_kstat_init(vfsp); 2663 } 2664 2665 void 2666 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2667 { 2668 mntinfo4_t *mi; 2669 clock_t now = ddi_get_lbolt(); 2670 2671 mi = VTOMI4(vp); 2672 /* 2673 * In case of forced unmount, do not print any messages 2674 * since it can flood the console with error messages. 2675 */ 2676 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2677 return; 2678 2679 /* 2680 * If the mount point is dead, not recoverable, do not 2681 * print error messages that can flood the console. 2682 */ 2683 if (mi->mi_flags & MI4_RECOV_FAIL) 2684 return; 2685 2686 /* 2687 * No use in flooding the console with ENOSPC 2688 * messages from the same file system. 
2689 */ 2690 if ((error != ENOSPC && error != EDQUOT) || 2691 now - mi->mi_printftime > 0) { 2692 zoneid_t zoneid = mi->mi_zone->zone_id; 2693 2694 #ifdef DEBUG 2695 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2696 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2697 #else 2698 nfs_perror(error, "NFS write error on host %s: %m.\n", 2699 VTOR4(vp)->r_server->sv_hostname, NULL); 2700 #endif 2701 if (error == ENOSPC || error == EDQUOT) { 2702 zcmn_err(zoneid, CE_CONT, 2703 "^File: userid=%d, groupid=%d\n", 2704 crgetuid(cr), crgetgid(cr)); 2705 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2706 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2707 zcmn_err(zoneid, CE_CONT, 2708 "^User: userid=%d, groupid=%d\n", 2709 crgetuid(curthread->t_cred), 2710 crgetgid(curthread->t_cred)); 2711 } 2712 mi->mi_printftime = now + 2713 nfs_write_error_interval * hz; 2714 } 2715 sfh4_printfhandle(VTOR4(vp)->r_fh); 2716 #ifdef DEBUG 2717 if (error == EACCES) { 2718 zcmn_err(zoneid, CE_CONT, 2719 "nfs_bio: cred is%s kcred\n", 2720 cr == kcred ? "" : " not"); 2721 } 2722 #endif 2723 } 2724 } 2725 2726 /* 2727 * Return non-zero if the given file can be safely memory mapped. Locks 2728 * are safe if whole-file (length and offset are both zero). 2729 */ 2730 2731 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2732 2733 static int 2734 nfs4_safemap(const vnode_t *vp) 2735 { 2736 locklist_t *llp, *next_llp; 2737 int safe = 1; 2738 rnode4_t *rp = VTOR4(vp); 2739 2740 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2741 2742 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2743 "vp = %p", (void *)vp)); 2744 2745 /* 2746 * Review all the locks for the vnode, both ones that have been 2747 * acquired and ones that are pending. We assume that 2748 * flk_active_locks_for_vp() has merged any locks that can be 2749 * merged (so that if a process has the entire file locked, it is 2750 * represented as a single lock). 
2751 * 2752 * Note that we can't bail out of the loop if we find a non-safe 2753 * lock, because we have to free all the elements in the llp list. 2754 * We might be able to speed up this code slightly by not looking 2755 * at each lock's l_start and l_len fields once we've found a 2756 * non-safe lock. 2757 */ 2758 2759 llp = flk_active_locks_for_vp(vp); 2760 while (llp) { 2761 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2762 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2763 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2764 if (!SAFE_LOCK(llp->ll_flock)) { 2765 safe = 0; 2766 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2767 "nfs4_safemap: unsafe active lock (%" PRId64 2768 ", %" PRId64 ")", llp->ll_flock.l_start, 2769 llp->ll_flock.l_len)); 2770 } 2771 next_llp = llp->ll_next; 2772 VN_RELE(llp->ll_vp); 2773 kmem_free(llp, sizeof (*llp)); 2774 llp = next_llp; 2775 } 2776 2777 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2778 safe ? "safe" : "unsafe")); 2779 return (safe); 2780 } 2781 2782 /* 2783 * Return whether there is a lost LOCK or LOCKU queued up for the given 2784 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
2785 */ 2786 2787 bool_t 2788 nfs4_map_lost_lock_conflict(vnode_t *vp) 2789 { 2790 bool_t conflict = FALSE; 2791 nfs4_lost_rqst_t *lrp; 2792 mntinfo4_t *mi = VTOMI4(vp); 2793 2794 mutex_enter(&mi->mi_lock); 2795 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2796 lrp = list_next(&mi->mi_lost_state, lrp)) { 2797 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2798 continue; 2799 ASSERT(lrp->lr_vp != NULL); 2800 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2801 continue; /* different file */ 2802 if (!SAFE_LOCK(*lrp->lr_flk)) { 2803 conflict = TRUE; 2804 break; 2805 } 2806 } 2807 2808 mutex_exit(&mi->mi_lock); 2809 return (conflict); 2810 } 2811 2812 /* 2813 * nfs_lockcompletion: 2814 * 2815 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2816 * as non cachable (set VNOCACHE bit). 2817 */ 2818 2819 void 2820 nfs4_lockcompletion(vnode_t *vp, int cmd) 2821 { 2822 rnode4_t *rp = VTOR4(vp); 2823 2824 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2825 ASSERT(!IS_SHADOW(vp, rp)); 2826 2827 if (cmd == F_SETLK || cmd == F_SETLKW) { 2828 2829 if (!nfs4_safemap(vp)) { 2830 mutex_enter(&vp->v_lock); 2831 vp->v_flag |= VNOCACHE; 2832 mutex_exit(&vp->v_lock); 2833 } else { 2834 mutex_enter(&vp->v_lock); 2835 vp->v_flag &= ~VNOCACHE; 2836 mutex_exit(&vp->v_lock); 2837 } 2838 } 2839 /* 2840 * The cached attributes of the file are stale after acquiring 2841 * the lock on the file. They were updated when the file was 2842 * opened, but not updated when the lock was acquired. Therefore the 2843 * cached attributes are invalidated after the lock is obtained. 
/*
 * nfs4_mi_init - zone key "create" callback (registered through
 * zone_key_create() in nfs4_clnt_init()).
 *
 * Allocates the per-zone mi4_globals, initializes its lock and its
 * (empty) list of mntinfo4_t structures linked through mi_zone_node, and
 * returns it as the zone-specific data.  The zoneid argument is unused.
 */
/* ARGSUSED */
static void *
nfs4_mi_init(zoneid_t zoneid)
{
	struct mi4_globals *mig;

	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&mig->mig_list, sizeof (mntinfo4_t),
	    offsetof(mntinfo4_t, mi_zone_node));
	mig->mig_destructor_called = B_FALSE;
	return (mig);
}

/*
 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
 * state and killing off threads.
 *
 * For each mount on the zone's list, in turn: purge the DNLC, tell the
 * async workers to exit, mark the mount MI4_ASYNC_MGR_STOP|MI4_DEAD, stop
 * the async manager, wait for the inactive thread and for any recovery in
 * progress, then unlink the mount from the list and drop the zone, vfs
 * and mi holds.  mig_lock is held from the list_head() lookup until the
 * mount has been removed from the list.
 *
 * Afterwards every nfs4_server_t belonging to the zone gets an extra
 * reference and is marked dead.
 */
/* ARGSUSED */
static void
nfs4_mi_shutdown(zoneid_t zoneid, void *data)
{
	struct mi4_globals *mig = data;
	mntinfo4_t *mi;
	nfs4_server_t *np;

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_mi_shutdown zone %d\n", zoneid));
	ASSERT(mig != NULL);
	for (;;) {
		mutex_enter(&mig->mig_lock);
		mi = list_head(&mig->mig_list);
		if (mi == NULL) {
			mutex_exit(&mig->mig_lock);
			break;
		}

		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
		/*
		 * purge the DNLC for this filesystem
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		/*
		 * Tell existing async worker threads to exit.
		 */
		mutex_enter(&mi->mi_async_lock);
		mi->mi_max_threads = 0;
		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
		/*
		 * Set the appropriate flags, signal and wait for both the
		 * async manager and the inactive thread to exit when they're
		 * done with their current work.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi->mi_async_lock);
		if (mi->mi_manager_thread) {
			nfs4_async_manager_stop(mi->mi_vfsp);
		}
		if (mi->mi_inactive_thread) {
			mutex_enter(&mi->mi_async_lock);
			cv_signal(&mi->mi_inact_req_cv);
			/*
			 * Wait for the inactive thread to exit.
			 */
			while (mi->mi_inactive_thread != NULL) {
				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			}
			mutex_exit(&mi->mi_async_lock);
		}
		/*
		 * Wait for the recovery thread to complete, that is, it will
		 * signal when it is done using the "mi" structure and about
		 * to exit
		 */
		mutex_enter(&mi->mi_lock);
		while (mi->mi_in_recovery > 0)
			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		mutex_exit(&mi->mi_lock);
		/*
		 * We're done when every mi has been done or the list is empty.
		 * This one is done, remove it from the list.
		 */
		list_remove(&mig->mig_list, mi);
		mutex_exit(&mig->mig_lock);
		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);

		/*
		 * Release hold on vfs and mi done to prevent race with zone
		 * shutdown.  This releases the hold in nfs4_mi_zonelist_add.
		 */
		VFS_RELE(mi->mi_vfsp);
		MI4_RELE(mi);
	}
	/*
	 * Tell each renew thread in the zone to exit
	 */
	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np->zoneid == zoneid) {
			/*
			 * We add another hold onto the nfs4_server_t
			 * because this will make sure that the nfs4_server_t
			 * stays around until nfs4_callback_fini_zone destroys
			 * the zone.  This way, the renew thread can
			 * unconditionally release its holds on the
			 * nfs4_server_t.
			 */
			np->s_refcnt++;
			nfs4_mark_srv_dead(np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);
}
2958 */ 2959 np->s_refcnt++; 2960 nfs4_mark_srv_dead(np); 2961 } 2962 mutex_exit(&np->s_lock); 2963 } 2964 mutex_exit(&nfs4_server_lst_lock); 2965 } 2966 2967 static void 2968 nfs4_mi_free_globals(struct mi4_globals *mig) 2969 { 2970 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2971 mutex_destroy(&mig->mig_lock); 2972 kmem_free(mig, sizeof (*mig)); 2973 } 2974 2975 /* ARGSUSED */ 2976 static void 2977 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2978 { 2979 struct mi4_globals *mig = data; 2980 2981 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2982 "nfs4_mi_destroy zone %d\n", zoneid)); 2983 ASSERT(mig != NULL); 2984 mutex_enter(&mig->mig_lock); 2985 if (list_head(&mig->mig_list) != NULL) { 2986 /* Still waiting for VFS_FREEVFS() */ 2987 mig->mig_destructor_called = B_TRUE; 2988 mutex_exit(&mig->mig_lock); 2989 return; 2990 } 2991 nfs4_mi_free_globals(mig); 2992 } 2993 2994 /* 2995 * Add an NFS mount to the per-zone list of NFS mounts. 2996 */ 2997 void 2998 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2999 { 3000 struct mi4_globals *mig; 3001 3002 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3003 mutex_enter(&mig->mig_lock); 3004 list_insert_head(&mig->mig_list, mi); 3005 /* 3006 * hold added to eliminate race with zone shutdown -this will be 3007 * released in mi_shutdown 3008 */ 3009 MI4_HOLD(mi); 3010 VFS_HOLD(mi->mi_vfsp); 3011 mutex_exit(&mig->mig_lock); 3012 } 3013 3014 /* 3015 * Remove an NFS mount from the per-zone list of NFS mounts. 3016 */ 3017 int 3018 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 3019 { 3020 struct mi4_globals *mig; 3021 int ret = 0; 3022 3023 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3024 mutex_enter(&mig->mig_lock); 3025 mutex_enter(&mi->mi_lock); 3026 /* if this mi is marked dead, then the zone already released it */ 3027 if (!(mi->mi_flags & MI4_DEAD)) { 3028 list_remove(&mig->mig_list, mi); 3029 mutex_exit(&mi->mi_lock); 3030 3031 /* release the holds put on in zonelist_add(). 
*/ 3032 VFS_RELE(mi->mi_vfsp); 3033 MI4_RELE(mi); 3034 ret = 1; 3035 } else { 3036 mutex_exit(&mi->mi_lock); 3037 } 3038 3039 /* 3040 * We can be called asynchronously by VFS_FREEVFS() after the zone 3041 * shutdown/destroy callbacks have executed; if so, clean up the zone's 3042 * mi globals. 3043 */ 3044 if (list_head(&mig->mig_list) == NULL && 3045 mig->mig_destructor_called == B_TRUE) { 3046 nfs4_mi_free_globals(mig); 3047 return (ret); 3048 } 3049 mutex_exit(&mig->mig_lock); 3050 return (ret); 3051 } 3052 3053 void 3054 nfs_free_mi4(mntinfo4_t *mi) 3055 { 3056 nfs4_open_owner_t *foop; 3057 nfs4_oo_hash_bucket_t *bucketp; 3058 nfs4_debug_msg_t *msgp; 3059 int i; 3060 servinfo4_t *svp; 3061 3062 /* 3063 * Code introduced here should be carefully evaluated to make 3064 * sure none of the freed resources are accessed either directly 3065 * or indirectly after freeing them. For eg: Introducing calls to 3066 * NFS4_DEBUG that use mntinfo4_t structure member after freeing 3067 * the structure members or other routines calling back into NFS 3068 * accessing freed mntinfo4_t structure member. 
3069 */ 3070 mutex_enter(&mi->mi_lock); 3071 ASSERT(mi->mi_recovthread == NULL); 3072 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3073 mutex_exit(&mi->mi_lock); 3074 mutex_enter(&mi->mi_async_lock); 3075 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 3076 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0); 3077 ASSERT(mi->mi_manager_thread == NULL); 3078 mutex_exit(&mi->mi_async_lock); 3079 if (mi->mi_io_kstats) { 3080 kstat_delete(mi->mi_io_kstats); 3081 mi->mi_io_kstats = NULL; 3082 } 3083 if (mi->mi_ro_kstats) { 3084 kstat_delete(mi->mi_ro_kstats); 3085 mi->mi_ro_kstats = NULL; 3086 } 3087 if (mi->mi_recov_ksp) { 3088 kstat_delete(mi->mi_recov_ksp); 3089 mi->mi_recov_ksp = NULL; 3090 } 3091 mutex_enter(&mi->mi_msg_list_lock); 3092 while (msgp = list_head(&mi->mi_msg_list)) { 3093 list_remove(&mi->mi_msg_list, msgp); 3094 nfs4_free_msg(msgp); 3095 } 3096 mutex_exit(&mi->mi_msg_list_lock); 3097 list_destroy(&mi->mi_msg_list); 3098 if (mi->mi_fname != NULL) 3099 fn_rele(&mi->mi_fname); 3100 if (mi->mi_rootfh != NULL) 3101 sfh4_rele(&mi->mi_rootfh); 3102 if (mi->mi_srvparentfh != NULL) 3103 sfh4_rele(&mi->mi_srvparentfh); 3104 svp = mi->mi_servers; 3105 sv4_free(svp); 3106 mutex_destroy(&mi->mi_lock); 3107 mutex_destroy(&mi->mi_async_lock); 3108 mutex_destroy(&mi->mi_msg_list_lock); 3109 nfs_rw_destroy(&mi->mi_recovlock); 3110 nfs_rw_destroy(&mi->mi_rename_lock); 3111 nfs_rw_destroy(&mi->mi_fh_lock); 3112 cv_destroy(&mi->mi_failover_cv); 3113 cv_destroy(&mi->mi_async_reqs_cv); 3114 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]); 3115 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]); 3116 cv_destroy(&mi->mi_async_cv); 3117 cv_destroy(&mi->mi_inact_req_cv); 3118 /* 3119 * Destroy the oo hash lists and mutexes for the cred hash table. 
/*
 * mi_hold - take a reference on a mntinfo4_t by atomically incrementing
 * mi_count.  The ASSERT catches the counter wrapping around to zero.
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_add_32(&mi->mi_count, 1);
	ASSERT(mi->mi_count != 0);
}

/*
 * mi_rele - drop a reference on a mntinfo4_t.  When the atomically
 * decremented count reaches zero the structure is torn down and freed by
 * nfs_free_mi4(), so the caller must not touch mi afterwards.
 */
void
mi_rele(mntinfo4_t *mi)
{
	/* Releasing a reference that was never taken is a bug. */
	ASSERT(mi->mi_count != 0);
	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
		nfs_free_mi4(mi);
	}
}

/*
 * Vnode for the "notsupp" extended attribute cache; nfs4_clnt_init()
 * sets its v_count to 1 so that it never goes away.
 */
vnode_t nfs4_xattr_notsupp_vnode;

/*
 * nfs4_clnt_init - one-time initialization of the NFSv4 client.
 *
 * Runs the initialization routines of the individual client components,
 * registers the CPR (suspend/resume) callback and the per-zone key whose
 * callbacks are nfs4_mi_init(), nfs4_mi_shutdown() and nfs4_mi_destroy(),
 * and pins nfs4_xattr_notsupp_vnode.  The return values of the component
 * initializers that return one are ignored.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef DEBUG
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3185 */ 3186 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3187 3188 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3189 nfs4_mi_destroy); 3190 3191 /* 3192 * Initialise the reference count of the notsupp xattr cache vnode to 1 3193 * so that it never goes away (VOP_INACTIVE isn't called on it). 3194 */ 3195 nfs4_xattr_notsupp_vnode.v_count = 1; 3196 } 3197 3198 void 3199 nfs4_clnt_fini(void) 3200 { 3201 (void) zone_key_delete(mi4_list_key); 3202 nfs4_vnops_fini(); 3203 (void) nfs4_rnode_fini(); 3204 (void) nfs4_shadow_fini(); 3205 (void) nfs4_acache_fini(); 3206 (void) nfs4_subr_fini(); 3207 nfs_idmap_fini(); 3208 nfs4_callback_fini(); 3209 nfs4_secinfo_fini(); 3210 #ifdef DEBUG 3211 tsd_destroy(&nfs4_tsd_key); 3212 #endif 3213 if (cid) 3214 (void) callb_delete(cid); 3215 } 3216 3217 /*ARGSUSED*/ 3218 static boolean_t 3219 nfs4_client_cpr_callb(void *arg, int code) 3220 { 3221 /* 3222 * We get called for Suspend and Resume events. 3223 * For the suspend case we simply don't care! 3224 */ 3225 if (code == CB_CODE_CPR_CHKPT) { 3226 return (B_TRUE); 3227 } 3228 3229 /* 3230 * When we get to here we are in the process of 3231 * resuming the system from a previous suspend. 
/*
 * nfs4_renew_lease_thread - body of the per-server lease renewal thread.
 *
 * Loops until sp->s_thread_exit becomes NFS4_THREAD_EXIT:
 *
 *  - While there is no state to protect (state_ref_count is zero) or the
 *    lease is not NFS4_LEASE_VALID, it simply sleeps on cv_thread_exit
 *    for roughly half the lease time.
 *
 *  - Otherwise it sleeps until about half the lease time has elapsed
 *    since the last renewal.  If no renewal happened while it slept, or
 *    the system was resumed from a suspend after the last renewal, it
 *    issues an explicit RENEW through nfs4renew().
 *
 * On exit the thread waits for s_otw_call_count to drop to zero,
 * releases two references on sp (its own and the list's) and terminates
 * with zthread_exit(); it does not return.
 *
 * s_lock is held throughout except across nfs4renew(), which drops it.
 * The waits are bracketed by CALLB_CPR_SAFE_BEGIN/END under cpr_lock.
 */
void
nfs4_renew_lease_thread(nfs4_server_t *sp)
{
	int	error = 0;
	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
	clock_t	tick_delay = 0;
	clock_t	time_left = 0;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");

	mutex_enter(&sp->s_lock);
	/* sp->s_lease_time is set via a GETATTR */
	sp->last_renewal_time = gethrestime_sec();
	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
	ASSERT(sp->s_refcnt >= 1);

	for (;;) {
		if (!sp->state_ref_count ||
		    sp->lease_valid != NFS4_LEASE_VALID) {

			/* Sleep at least one second. */
			kip_secs = MAX((sp->s_lease_time >> 1) -
			    (3 * sp->propagation_delay.tv_sec), 1);

			tick_delay = SEC_TO_TICK(kip_secs);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : thread "
			    "wait %ld secs", kip_secs));

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : "
			    "state_ref_count %d, lease_valid %d",
			    sp->state_ref_count, sp->lease_valid));

			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_BEGIN(&cpr_info);
			mutex_exit(&cpr_lock);
			time_left = cv_reltimedwait(&sp->cv_thread_exit,
			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
			mutex_exit(&cpr_lock);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew: "
			    "time left %ld", time_left));

			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;
			continue;
		}

		/* Snapshot used to detect a renewal that happens while we sleep. */
		tmp_last_renewal_time = sp->last_renewal_time;

		tmp_time = gethrestime_sec() - sp->last_renewal_time +
		    (3 * sp->propagation_delay.tv_sec);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: tmp_time %ld, "
		    "sp->last_renewal_time %ld", tmp_time,
		    sp->last_renewal_time));

		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);

		tick_delay = SEC_TO_TICK(kip_secs);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
		    "secs", kip_secs));

		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
		    tick_delay, TR_CLOCK_TICK);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
		    "tmp_last_renewal_time %ld", time_left,
		    sp->last_renewal_time, nfs4_client_resumed,
		    tmp_last_renewal_time));

		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
			goto die;

		if (tmp_last_renewal_time == sp->last_renewal_time ||
		    (nfs4_client_resumed != 0 &&
		    nfs4_client_resumed > sp->last_renewal_time)) {
			/*
			 * Issue RENEW op since we haven't renewed the lease
			 * since we slept.
			 */
			tmp_now_time = gethrestime_sec();
			error = nfs4renew(sp);
			/*
			 * Need to re-acquire sp's lock, nfs4renew()
			 * relinquishes it.
			 */
			mutex_enter(&sp->s_lock);

			/*
			 * See if someone changed s_thread_exit while we gave
			 * up s_lock.
			 */
			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;

			if (!error) {
				/*
				 * check to see if we implicitly renewed while
				 * we waited for a reply for our RENEW call.
				 */
				if (tmp_last_renewal_time ==
				    sp->last_renewal_time) {
					/* no implicit renew came */
					sp->last_renewal_time = tmp_now_time;
				} else {
					NFS4_DEBUG(nfs4_client_lease_debug,
					    (CE_NOTE, "renew_thread: did "
					    "implicit renewal before reply "
					    "from server for RENEW"));
				}
			} else {
				/* figure out error */
				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
				    "renew_thread: nfs4renew returned error"
				    " %d", error));
			}

		}
	}

die:
	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: thread exiting"));

	/* Let outstanding over-the-wire calls drain before letting go of sp. */
	while (sp->s_otw_call_count != 0) {
		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: waiting for outstanding "
		    "otw calls to finish for sp 0x%p, current "
		    "s_otw_call_count %d", (void *)sp,
		    sp->s_otw_call_count));
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
	}
	mutex_exit(&sp->s_lock);

	nfs4_server_rele(sp);		/* free the thread's reference */
	nfs4_server_rele(sp);		/* free the list's reference */
	sp = NULL;

	/*
	 * NOTE(review): nothing in this function jumps to "done"; the label
	 * is only reached by falling through from above.
	 */
done:
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
	mutex_destroy(&cpr_lock);

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: renew thread exit officially"));

	zthread_exit();
	/* NOT REACHED */
}
3417 */ 3418 static int 3419 nfs4renew(nfs4_server_t *sp) 3420 { 3421 COMPOUND4args_clnt args; 3422 COMPOUND4res_clnt res; 3423 nfs_argop4 argop[1]; 3424 int doqueue = 1; 3425 int rpc_error; 3426 cred_t *cr; 3427 mntinfo4_t *mi; 3428 timespec_t prop_time, after_time; 3429 int needrecov = FALSE; 3430 nfs4_recov_state_t recov_state; 3431 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3432 3433 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3434 3435 recov_state.rs_flags = 0; 3436 recov_state.rs_num_retry_despite_err = 0; 3437 3438 recov_retry: 3439 mi = sp->mntinfo4_list; 3440 VFS_HOLD(mi->mi_vfsp); 3441 mutex_exit(&sp->s_lock); 3442 ASSERT(mi != NULL); 3443 3444 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3445 if (e.error) { 3446 VFS_RELE(mi->mi_vfsp); 3447 return (e.error); 3448 } 3449 3450 /* Check to see if we're dealing with a marked-dead sp */ 3451 mutex_enter(&sp->s_lock); 3452 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3453 mutex_exit(&sp->s_lock); 3454 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3455 VFS_RELE(mi->mi_vfsp); 3456 return (0); 3457 } 3458 3459 /* Make sure mi hasn't changed on us */ 3460 if (mi != sp->mntinfo4_list) { 3461 /* Must drop sp's lock to avoid a recursive mutex enter */ 3462 mutex_exit(&sp->s_lock); 3463 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3464 VFS_RELE(mi->mi_vfsp); 3465 mutex_enter(&sp->s_lock); 3466 goto recov_retry; 3467 } 3468 mutex_exit(&sp->s_lock); 3469 3470 args.ctag = TAG_RENEW; 3471 3472 args.array_len = 1; 3473 args.array = argop; 3474 3475 argop[0].argop = OP_RENEW; 3476 3477 mutex_enter(&sp->s_lock); 3478 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3479 cr = sp->s_cred; 3480 crhold(cr); 3481 mutex_exit(&sp->s_lock); 3482 3483 ASSERT(cr != NULL); 3484 3485 /* used to figure out RTT for sp */ 3486 gethrestime(&prop_time); 3487 3488 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3489 "nfs4renew: %s call, sp 0x%p", needrecov ? 
"recov" : "first", 3490 (void*)sp)); 3491 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3492 prop_time.tv_sec, prop_time.tv_nsec)); 3493 3494 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3495 mntinfo4_t *, mi); 3496 3497 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3498 crfree(cr); 3499 3500 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3501 mntinfo4_t *, mi); 3502 3503 gethrestime(&after_time); 3504 3505 mutex_enter(&sp->s_lock); 3506 sp->propagation_delay.tv_sec = 3507 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3508 mutex_exit(&sp->s_lock); 3509 3510 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3511 after_time.tv_sec, after_time.tv_nsec)); 3512 3513 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3514 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3515 nfs4_delegreturn_all(sp); 3516 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3517 VFS_RELE(mi->mi_vfsp); 3518 /* 3519 * If the server returns CB_PATH_DOWN, it has renewed 3520 * the lease and informed us that the callback path is 3521 * down. Since the lease is renewed, just return 0 and 3522 * let the renew thread proceed as normal. 
3523 */ 3524 return (0); 3525 } 3526 3527 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3528 if (!needrecov && e.error) { 3529 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3530 VFS_RELE(mi->mi_vfsp); 3531 return (e.error); 3532 } 3533 3534 rpc_error = e.error; 3535 3536 if (needrecov) { 3537 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3538 "nfs4renew: initiating recovery\n")); 3539 3540 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3541 OP_RENEW, NULL, NULL, NULL) == FALSE) { 3542 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3543 VFS_RELE(mi->mi_vfsp); 3544 if (!e.error) 3545 (void) xdr_free(xdr_COMPOUND4res_clnt, 3546 (caddr_t)&res); 3547 mutex_enter(&sp->s_lock); 3548 goto recov_retry; 3549 } 3550 /* fall through for res.status case */ 3551 } 3552 3553 if (res.status) { 3554 if (res.status == NFS4ERR_LEASE_MOVED) { 3555 /*EMPTY*/ 3556 /* 3557 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3558 * to renew the lease on that server 3559 */ 3560 } 3561 e.error = geterrno4(res.status); 3562 } 3563 3564 if (!rpc_error) 3565 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3566 3567 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3568 3569 VFS_RELE(mi->mi_vfsp); 3570 3571 return (e.error); 3572 } 3573 3574 void 3575 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3576 { 3577 nfs4_server_t *sp; 3578 3579 /* this locks down sp if it is found */ 3580 sp = find_nfs4_server(mi); 3581 3582 if (sp != NULL) { 3583 nfs4_inc_state_ref_count_nolock(sp, mi); 3584 mutex_exit(&sp->s_lock); 3585 nfs4_server_rele(sp); 3586 } 3587 } 3588 3589 /* 3590 * Bump the number of OPEN files (ie: those with state) so we know if this 3591 * nfs4_server has any state to maintain a lease for or not. 3592 * 3593 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 
3594 */ 3595 void 3596 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3597 { 3598 ASSERT(mutex_owned(&sp->s_lock)); 3599 3600 sp->state_ref_count++; 3601 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3602 "nfs4_inc_state_ref_count: state_ref_count now %d", 3603 sp->state_ref_count)); 3604 3605 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3606 sp->lease_valid = NFS4_LEASE_VALID; 3607 3608 /* 3609 * If this call caused the lease to be marked valid and/or 3610 * took the state_ref_count from 0 to 1, then start the time 3611 * on lease renewal. 3612 */ 3613 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3614 sp->last_renewal_time = gethrestime_sec(); 3615 3616 /* update the number of open files for mi */ 3617 mi->mi_open_files++; 3618 } 3619 3620 void 3621 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3622 { 3623 nfs4_server_t *sp; 3624 3625 /* this locks down sp if it is found */ 3626 sp = find_nfs4_server_all(mi, 1); 3627 3628 if (sp != NULL) { 3629 nfs4_dec_state_ref_count_nolock(sp, mi); 3630 mutex_exit(&sp->s_lock); 3631 nfs4_server_rele(sp); 3632 } 3633 } 3634 3635 /* 3636 * Decrement the number of OPEN files (ie: those with state) so we know if 3637 * this nfs4_server has any state to maintain a lease for or not. 
3638 */ 3639 void 3640 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3641 { 3642 ASSERT(mutex_owned(&sp->s_lock)); 3643 ASSERT(sp->state_ref_count != 0); 3644 sp->state_ref_count--; 3645 3646 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3647 "nfs4_dec_state_ref_count: state ref count now %d", 3648 sp->state_ref_count)); 3649 3650 mi->mi_open_files--; 3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3652 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3653 mi->mi_open_files, mi->mi_flags)); 3654 3655 /* We don't have to hold the mi_lock to test mi_flags */ 3656 if (mi->mi_open_files == 0 && 3657 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3658 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3659 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3660 "we have closed the last open file", (void*)mi)); 3661 nfs4_remove_mi_from_server(mi, sp); 3662 } 3663 } 3664 3665 bool_t 3666 inlease(nfs4_server_t *sp) 3667 { 3668 bool_t result; 3669 3670 ASSERT(mutex_owned(&sp->s_lock)); 3671 3672 if (sp->lease_valid == NFS4_LEASE_VALID && 3673 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3674 result = TRUE; 3675 else 3676 result = FALSE; 3677 3678 return (result); 3679 } 3680 3681 3682 /* 3683 * Return non-zero if the given nfs4_server_t is going through recovery. 3684 */ 3685 3686 int 3687 nfs4_server_in_recovery(nfs4_server_t *sp) 3688 { 3689 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3690 } 3691 3692 /* 3693 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3694 * first is less than, equal to, or greater than the second. 3695 */ 3696 3697 int 3698 sfh4cmp(const void *p1, const void *p2) 3699 { 3700 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3701 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3702 3703 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3704 } 3705 3706 /* 3707 * Create a table for shared filehandle objects. 
3708 */ 3709 3710 void 3711 sfh4_createtab(avl_tree_t *tab) 3712 { 3713 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3714 offsetof(nfs4_sharedfh_t, sfh_tree)); 3715 } 3716 3717 /* 3718 * Return a shared filehandle object for the given filehandle. The caller 3719 * is responsible for eventually calling sfh4_rele(). 3720 */ 3721 3722 nfs4_sharedfh_t * 3723 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3724 { 3725 nfs4_sharedfh_t *sfh, *nsfh; 3726 avl_index_t where; 3727 nfs4_sharedfh_t skey; 3728 3729 if (!key) { 3730 skey.sfh_fh = *fh; 3731 key = &skey; 3732 } 3733 3734 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3735 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3736 /* 3737 * We allocate the largest possible filehandle size because it's 3738 * not that big, and it saves us from possibly having to resize the 3739 * buffer later. 3740 */ 3741 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3742 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3743 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3744 nsfh->sfh_refcnt = 1; 3745 nsfh->sfh_flags = SFH4_IN_TREE; 3746 nsfh->sfh_mi = mi; 3747 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3748 (void *)nsfh)); 3749 3750 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3751 sfh = avl_find(&mi->mi_filehandles, key, &where); 3752 if (sfh != NULL) { 3753 mutex_enter(&sfh->sfh_lock); 3754 sfh->sfh_refcnt++; 3755 mutex_exit(&sfh->sfh_lock); 3756 nfs_rw_exit(&mi->mi_fh_lock); 3757 /* free our speculative allocs */ 3758 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3759 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3760 return (sfh); 3761 } 3762 3763 avl_insert(&mi->mi_filehandles, nsfh, where); 3764 nfs_rw_exit(&mi->mi_fh_lock); 3765 3766 return (nsfh); 3767 } 3768 3769 /* 3770 * Return a shared filehandle object for the given filehandle. The caller 3771 * is responsible for eventually calling sfh4_rele(). 
3772 */ 3773 3774 nfs4_sharedfh_t * 3775 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3776 { 3777 nfs4_sharedfh_t *sfh; 3778 nfs4_sharedfh_t key; 3779 3780 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3781 3782 #ifdef DEBUG 3783 if (nfs4_sharedfh_debug) { 3784 nfs4_fhandle_t fhandle; 3785 3786 fhandle.fh_len = fh->nfs_fh4_len; 3787 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3788 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3789 nfs4_printfhandle(&fhandle); 3790 } 3791 #endif 3792 3793 /* 3794 * If there's already an object for the given filehandle, bump the 3795 * reference count and return it. Otherwise, create a new object 3796 * and add it to the AVL tree. 3797 */ 3798 3799 key.sfh_fh = *fh; 3800 3801 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3802 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3803 if (sfh != NULL) { 3804 mutex_enter(&sfh->sfh_lock); 3805 sfh->sfh_refcnt++; 3806 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3807 "sfh4_get: found existing %p, new refcnt=%d", 3808 (void *)sfh, sfh->sfh_refcnt)); 3809 mutex_exit(&sfh->sfh_lock); 3810 nfs_rw_exit(&mi->mi_fh_lock); 3811 return (sfh); 3812 } 3813 nfs_rw_exit(&mi->mi_fh_lock); 3814 3815 return (sfh4_put(fh, mi, &key)); 3816 } 3817 3818 /* 3819 * Get a reference to the given shared filehandle object. 3820 */ 3821 3822 void 3823 sfh4_hold(nfs4_sharedfh_t *sfh) 3824 { 3825 ASSERT(sfh->sfh_refcnt > 0); 3826 3827 mutex_enter(&sfh->sfh_lock); 3828 sfh->sfh_refcnt++; 3829 NFS4_DEBUG(nfs4_sharedfh_debug, 3830 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3831 (void *)sfh, sfh->sfh_refcnt)); 3832 mutex_exit(&sfh->sfh_lock); 3833 } 3834 3835 /* 3836 * Release a reference to the given shared filehandle object and null out 3837 * the given pointer. 
3838 */ 3839 3840 void 3841 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3842 { 3843 mntinfo4_t *mi; 3844 nfs4_sharedfh_t *sfh = *sfhpp; 3845 3846 ASSERT(sfh->sfh_refcnt > 0); 3847 3848 mutex_enter(&sfh->sfh_lock); 3849 if (sfh->sfh_refcnt > 1) { 3850 sfh->sfh_refcnt--; 3851 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3852 "sfh4_rele %p, new refcnt=%d", 3853 (void *)sfh, sfh->sfh_refcnt)); 3854 mutex_exit(&sfh->sfh_lock); 3855 goto finish; 3856 } 3857 mutex_exit(&sfh->sfh_lock); 3858 3859 /* 3860 * Possibly the last reference, so get the lock for the table in 3861 * case it's time to remove the object from the table. 3862 */ 3863 mi = sfh->sfh_mi; 3864 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3865 mutex_enter(&sfh->sfh_lock); 3866 sfh->sfh_refcnt--; 3867 if (sfh->sfh_refcnt > 0) { 3868 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3869 "sfh4_rele %p, new refcnt=%d", 3870 (void *)sfh, sfh->sfh_refcnt)); 3871 mutex_exit(&sfh->sfh_lock); 3872 nfs_rw_exit(&mi->mi_fh_lock); 3873 goto finish; 3874 } 3875 3876 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3877 "sfh4_rele %p, last ref", (void *)sfh)); 3878 if (sfh->sfh_flags & SFH4_IN_TREE) { 3879 avl_remove(&mi->mi_filehandles, sfh); 3880 sfh->sfh_flags &= ~SFH4_IN_TREE; 3881 } 3882 mutex_exit(&sfh->sfh_lock); 3883 nfs_rw_exit(&mi->mi_fh_lock); 3884 mutex_destroy(&sfh->sfh_lock); 3885 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3886 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3887 3888 finish: 3889 *sfhpp = NULL; 3890 } 3891 3892 /* 3893 * Update the filehandle for the given shared filehandle object. 
3894 */ 3895 3896 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3897 3898 void 3899 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3900 { 3901 mntinfo4_t *mi = sfh->sfh_mi; 3902 nfs4_sharedfh_t *dupsfh; 3903 avl_index_t where; 3904 nfs4_sharedfh_t key; 3905 3906 #ifdef DEBUG 3907 mutex_enter(&sfh->sfh_lock); 3908 ASSERT(sfh->sfh_refcnt > 0); 3909 mutex_exit(&sfh->sfh_lock); 3910 #endif 3911 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3912 3913 /* 3914 * The basic plan is to remove the shared filehandle object from 3915 * the table, update it to have the new filehandle, then reinsert 3916 * it. 3917 */ 3918 3919 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3920 mutex_enter(&sfh->sfh_lock); 3921 if (sfh->sfh_flags & SFH4_IN_TREE) { 3922 avl_remove(&mi->mi_filehandles, sfh); 3923 sfh->sfh_flags &= ~SFH4_IN_TREE; 3924 } 3925 mutex_exit(&sfh->sfh_lock); 3926 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3927 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3928 sfh->sfh_fh.nfs_fh4_len); 3929 3930 /* 3931 * XXX If there is already a shared filehandle object with the new 3932 * filehandle, we're in trouble, because the rnode code assumes 3933 * that there is only one shared filehandle object for a given 3934 * filehandle. So issue a warning (for read-write mounts only) 3935 * and don't try to re-insert the given object into the table. 3936 * Hopefully the given object will quickly go away and everyone 3937 * will use the new object. 
3938 */ 3939 key.sfh_fh = *newfh; 3940 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3941 if (dupsfh != NULL) { 3942 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3943 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3944 "duplicate filehandle detected"); 3945 sfh4_printfhandle(dupsfh); 3946 } 3947 } else { 3948 avl_insert(&mi->mi_filehandles, sfh, where); 3949 mutex_enter(&sfh->sfh_lock); 3950 sfh->sfh_flags |= SFH4_IN_TREE; 3951 mutex_exit(&sfh->sfh_lock); 3952 } 3953 nfs_rw_exit(&mi->mi_fh_lock); 3954 } 3955 3956 /* 3957 * Copy out the current filehandle for the given shared filehandle object. 3958 */ 3959 3960 void 3961 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3962 { 3963 mntinfo4_t *mi = sfh->sfh_mi; 3964 3965 ASSERT(sfh->sfh_refcnt > 0); 3966 3967 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3968 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3969 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3970 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3971 nfs_rw_exit(&mi->mi_fh_lock); 3972 } 3973 3974 /* 3975 * Print out the filehandle for the given shared filehandle object. 3976 */ 3977 3978 void 3979 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3980 { 3981 nfs4_fhandle_t fhandle; 3982 3983 sfh4_copyval(sfh, &fhandle); 3984 nfs4_printfhandle(&fhandle); 3985 } 3986 3987 /* 3988 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3989 * if they're the same, +1 if the first is "greater" than the second. The 3990 * caller (or whoever's calling the AVL package) is responsible for 3991 * handling locking issues. 3992 */ 3993 3994 static int 3995 fncmp(const void *p1, const void *p2) 3996 { 3997 const nfs4_fname_t *f1 = p1; 3998 const nfs4_fname_t *f2 = p2; 3999 int res; 4000 4001 res = strcmp(f1->fn_name, f2->fn_name); 4002 /* 4003 * The AVL package wants +/-1, not arbitrary positive or negative 4004 * integers. 
4005 */ 4006 if (res > 0) 4007 res = 1; 4008 else if (res < 0) 4009 res = -1; 4010 return (res); 4011 } 4012 4013 /* 4014 * Get or create an fname with the given name, as a child of the given 4015 * fname. The caller is responsible for eventually releasing the reference 4016 * (fn_rele()). parent may be NULL. 4017 */ 4018 4019 nfs4_fname_t * 4020 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh) 4021 { 4022 nfs4_fname_t key; 4023 nfs4_fname_t *fnp; 4024 avl_index_t where; 4025 4026 key.fn_name = name; 4027 4028 /* 4029 * If there's already an fname registered with the given name, bump 4030 * its reference count and return it. Otherwise, create a new one 4031 * and add it to the parent's AVL tree. 4032 * 4033 * fname entries we are looking for should match both name 4034 * and sfh stored in the fname. 4035 */ 4036 again: 4037 if (parent != NULL) { 4038 mutex_enter(&parent->fn_lock); 4039 fnp = avl_find(&parent->fn_children, &key, &where); 4040 if (fnp != NULL) { 4041 /* 4042 * This hold on fnp is released below later, 4043 * in case this is not the fnp we want. 4044 */ 4045 fn_hold(fnp); 4046 4047 if (fnp->fn_sfh == sfh) { 4048 /* 4049 * We have found our entry. 4050 * put an hold and return it. 4051 */ 4052 mutex_exit(&parent->fn_lock); 4053 return (fnp); 4054 } 4055 4056 /* 4057 * We have found an entry that has a mismatching 4058 * fn_sfh. This could be a stale entry due to 4059 * server side rename. We will remove this entry 4060 * and make sure no such entries exist. 4061 */ 4062 mutex_exit(&parent->fn_lock); 4063 mutex_enter(&fnp->fn_lock); 4064 if (fnp->fn_parent == parent) { 4065 /* 4066 * Remove ourselves from parent's 4067 * fn_children tree. 
4068 */ 4069 mutex_enter(&parent->fn_lock); 4070 avl_remove(&parent->fn_children, fnp); 4071 mutex_exit(&parent->fn_lock); 4072 fn_rele(&fnp->fn_parent); 4073 } 4074 mutex_exit(&fnp->fn_lock); 4075 fn_rele(&fnp); 4076 goto again; 4077 } 4078 } 4079 4080 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 4081 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 4082 fnp->fn_parent = parent; 4083 if (parent != NULL) 4084 fn_hold(parent); 4085 fnp->fn_len = strlen(name); 4086 ASSERT(fnp->fn_len < MAXNAMELEN); 4087 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 4088 (void) strcpy(fnp->fn_name, name); 4089 fnp->fn_refcnt = 1; 4090 4091 /* 4092 * This hold on sfh is later released 4093 * when we do the final fn_rele() on this fname. 4094 */ 4095 sfh4_hold(sfh); 4096 fnp->fn_sfh = sfh; 4097 4098 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 4099 offsetof(nfs4_fname_t, fn_tree)); 4100 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4101 "fn_get %p:%s, a new nfs4_fname_t!", 4102 (void *)fnp, fnp->fn_name)); 4103 if (parent != NULL) { 4104 avl_insert(&parent->fn_children, fnp, where); 4105 mutex_exit(&parent->fn_lock); 4106 } 4107 4108 return (fnp); 4109 } 4110 4111 void 4112 fn_hold(nfs4_fname_t *fnp) 4113 { 4114 atomic_add_32(&fnp->fn_refcnt, 1); 4115 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4116 "fn_hold %p:%s, new refcnt=%d", 4117 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4118 } 4119 4120 /* 4121 * Decrement the reference count of the given fname, and destroy it if its 4122 * reference count goes to zero. Nulls out the given pointer. 
4123 */ 4124 4125 void 4126 fn_rele(nfs4_fname_t **fnpp) 4127 { 4128 nfs4_fname_t *parent; 4129 uint32_t newref; 4130 nfs4_fname_t *fnp; 4131 4132 recur: 4133 fnp = *fnpp; 4134 *fnpp = NULL; 4135 4136 mutex_enter(&fnp->fn_lock); 4137 parent = fnp->fn_parent; 4138 if (parent != NULL) 4139 mutex_enter(&parent->fn_lock); /* prevent new references */ 4140 newref = atomic_add_32_nv(&fnp->fn_refcnt, -1); 4141 if (newref > 0) { 4142 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4143 "fn_rele %p:%s, new refcnt=%d", 4144 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4145 if (parent != NULL) 4146 mutex_exit(&parent->fn_lock); 4147 mutex_exit(&fnp->fn_lock); 4148 return; 4149 } 4150 4151 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4152 "fn_rele %p:%s, last reference, deleting...", 4153 (void *)fnp, fnp->fn_name)); 4154 if (parent != NULL) { 4155 avl_remove(&parent->fn_children, fnp); 4156 mutex_exit(&parent->fn_lock); 4157 } 4158 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4159 sfh4_rele(&fnp->fn_sfh); 4160 mutex_destroy(&fnp->fn_lock); 4161 avl_destroy(&fnp->fn_children); 4162 kmem_free(fnp, sizeof (nfs4_fname_t)); 4163 /* 4164 * Recursivly fn_rele the parent. 4165 * Use goto instead of a recursive call to avoid stack overflow. 4166 */ 4167 if (parent != NULL) { 4168 fnpp = &parent; 4169 goto recur; 4170 } 4171 } 4172 4173 /* 4174 * Returns the single component name of the given fname, in a MAXNAMELEN 4175 * string buffer, which the caller is responsible for freeing. Note that 4176 * the name may become invalid as a result of fn_move(). 
4177 */ 4178 4179 char * 4180 fn_name(nfs4_fname_t *fnp) 4181 { 4182 char *name; 4183 4184 ASSERT(fnp->fn_len < MAXNAMELEN); 4185 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4186 mutex_enter(&fnp->fn_lock); 4187 (void) strcpy(name, fnp->fn_name); 4188 mutex_exit(&fnp->fn_lock); 4189 4190 return (name); 4191 } 4192 4193 4194 /* 4195 * fn_path_realloc 4196 * 4197 * This function, used only by fn_path, constructs 4198 * a new string which looks like "prepend" + "/" + "current". 4199 * by allocating a new string and freeing the old one. 4200 */ 4201 static void 4202 fn_path_realloc(char **curses, char *prepend) 4203 { 4204 int len, curlen = 0; 4205 char *news; 4206 4207 if (*curses == NULL) { 4208 /* 4209 * Prime the pump, allocate just the 4210 * space for prepend and return that. 4211 */ 4212 len = strlen(prepend) + 1; 4213 news = kmem_alloc(len, KM_SLEEP); 4214 (void) strncpy(news, prepend, len); 4215 } else { 4216 /* 4217 * Allocate the space for a new string 4218 * +1 +1 is for the "/" and the NULL 4219 * byte at the end of it all. 4220 */ 4221 curlen = strlen(*curses); 4222 len = curlen + strlen(prepend) + 1 + 1; 4223 news = kmem_alloc(len, KM_SLEEP); 4224 (void) strncpy(news, prepend, len); 4225 (void) strcat(news, "/"); 4226 (void) strcat(news, *curses); 4227 kmem_free(*curses, curlen + 1); 4228 } 4229 *curses = news; 4230 } 4231 4232 /* 4233 * Returns the path name (starting from the fs root) for the given fname. 4234 * The caller is responsible for freeing. Note that the path may be or 4235 * become invalid as a result of fn_move(). 4236 */ 4237 4238 char * 4239 fn_path(nfs4_fname_t *fnp) 4240 { 4241 char *path; 4242 nfs4_fname_t *nextfnp; 4243 4244 if (fnp == NULL) 4245 return (NULL); 4246 4247 path = NULL; 4248 4249 /* walk up the tree constructing the pathname. 
*/ 4250 4251 fn_hold(fnp); /* adjust for later rele */ 4252 do { 4253 mutex_enter(&fnp->fn_lock); 4254 /* 4255 * Add fn_name in front of the current path 4256 */ 4257 fn_path_realloc(&path, fnp->fn_name); 4258 nextfnp = fnp->fn_parent; 4259 if (nextfnp != NULL) 4260 fn_hold(nextfnp); 4261 mutex_exit(&fnp->fn_lock); 4262 fn_rele(&fnp); 4263 fnp = nextfnp; 4264 } while (fnp != NULL); 4265 4266 return (path); 4267 } 4268 4269 /* 4270 * Return a reference to the parent of the given fname, which the caller is 4271 * responsible for eventually releasing. 4272 */ 4273 4274 nfs4_fname_t * 4275 fn_parent(nfs4_fname_t *fnp) 4276 { 4277 nfs4_fname_t *parent; 4278 4279 mutex_enter(&fnp->fn_lock); 4280 parent = fnp->fn_parent; 4281 if (parent != NULL) 4282 fn_hold(parent); 4283 mutex_exit(&fnp->fn_lock); 4284 4285 return (parent); 4286 } 4287 4288 /* 4289 * Update fnp so that its parent is newparent and its name is newname. 4290 */ 4291 4292 void 4293 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4294 { 4295 nfs4_fname_t *parent, *tmpfnp; 4296 ssize_t newlen; 4297 nfs4_fname_t key; 4298 avl_index_t where; 4299 4300 /* 4301 * This assert exists to catch the client trying to rename 4302 * a dir to be a child of itself. This happened at a recent 4303 * bakeoff against a 3rd party (broken) server which allowed 4304 * the rename to succeed. If it trips it means that: 4305 * a) the code in nfs4rename that detects this case is broken 4306 * b) the server is broken (since it allowed the bogus rename) 4307 * 4308 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4309 * panic below from: mutex_enter(&newparent->fn_lock); 4310 */ 4311 ASSERT(fnp != newparent); 4312 4313 /* 4314 * Remove fnp from its current parent, change its name, then add it 4315 * to newparent. It might happen that fnp was replaced by another 4316 * nfs4_fname_t with the same fn_name in parent->fn_children. 
4317 * In such case, fnp->fn_parent is NULL and we skip the removal 4318 * of fnp from its current parent. 4319 */ 4320 mutex_enter(&fnp->fn_lock); 4321 parent = fnp->fn_parent; 4322 if (parent != NULL) { 4323 mutex_enter(&parent->fn_lock); 4324 avl_remove(&parent->fn_children, fnp); 4325 mutex_exit(&parent->fn_lock); 4326 fn_rele(&fnp->fn_parent); 4327 } 4328 4329 newlen = strlen(newname); 4330 if (newlen != fnp->fn_len) { 4331 ASSERT(newlen < MAXNAMELEN); 4332 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4333 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4334 fnp->fn_len = newlen; 4335 } 4336 (void) strcpy(fnp->fn_name, newname); 4337 4338 again: 4339 mutex_enter(&newparent->fn_lock); 4340 key.fn_name = fnp->fn_name; 4341 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4342 if (tmpfnp != NULL) { 4343 /* 4344 * This could be due to a file that was unlinked while 4345 * open, or perhaps the rnode is in the free list. Remove 4346 * it from newparent and let it go away on its own. The 4347 * contorted code is to deal with lock order issues and 4348 * race conditions. 4349 */ 4350 fn_hold(tmpfnp); 4351 mutex_exit(&newparent->fn_lock); 4352 mutex_enter(&tmpfnp->fn_lock); 4353 if (tmpfnp->fn_parent == newparent) { 4354 mutex_enter(&newparent->fn_lock); 4355 avl_remove(&newparent->fn_children, tmpfnp); 4356 mutex_exit(&newparent->fn_lock); 4357 fn_rele(&tmpfnp->fn_parent); 4358 } 4359 mutex_exit(&tmpfnp->fn_lock); 4360 fn_rele(&tmpfnp); 4361 goto again; 4362 } 4363 fnp->fn_parent = newparent; 4364 fn_hold(newparent); 4365 avl_insert(&newparent->fn_children, fnp, where); 4366 mutex_exit(&newparent->fn_lock); 4367 mutex_exit(&fnp->fn_lock); 4368 } 4369 4370 #ifdef DEBUG 4371 /* 4372 * Return non-zero if the type information makes sense for the given vnode. 4373 * Otherwise panic. 
4374 */ 4375 int 4376 nfs4_consistent_type(vnode_t *vp) 4377 { 4378 rnode4_t *rp = VTOR4(vp); 4379 4380 if (nfs4_vtype_debug && vp->v_type != VNON && 4381 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4382 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4383 "rnode attr type=%d", (void *)vp, vp->v_type, 4384 rp->r_attr.va_type); 4385 } 4386 4387 return (1); 4388 } 4389 #endif /* DEBUG */