/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */

#define	PAGE_SE_MUTEX(pp)	&pse_mutex[ \
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) & \
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define	VPH_TABLE_SIZE	(8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
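
/*
 * Illustrative sizing example (not from the original source, figures are
 * hypothetical): on a machine with 16 GB of memory (16384 MB worth of
 * pages) and ncpu = 64, the calculation above gives
 *
 *	MIN(16384, 2 * 64 * 64) = 8192,  MAX(128, 8192) = 8192,
 *
 * which is already a power of two, so the function returns pse_shift = 13,
 * i.e. a pse_mutex[] table of 2^13 = 8192 padded mutexes.
 */
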
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 * 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value.  This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation.  If this ever becomes an issue, a priority or
 *   fifo mechanism should also be implemented.  Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 * memory thread); this prevents reader-starvation of waiting
 * writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 * retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 * if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
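
/*
 * Caller-side sketch (illustrative only, not part of the original file):
 * a typical lookup-and-lock loop passes the mutex protecting the lookup
 * structure as `lock', so that page_lock_es() can drop it if it must
 * block; a zero return then forces the caller to redo the lookup, since
 * the page identity [vp, off] may have changed while we were blocked.
 *
 *	top:
 *		mutex_enter(phm);	(phm protects the lookup structure)
 *		pp = <find page for [vp, off]>;
 *		if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			mutex_exit(phm);
 *			goto top;
 *		}
 *		mutex_exit(phm);
 */
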
/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if a pending writer wants exclusive access
		 * (SE_EWANTED is set), if the page is retired, or if a
		 * share lock is requested on a page slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * has resulted due to the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
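
/*
 * Caller-side sketch (illustrative only, not part of the original file):
 * page_tryupgrade() only succeeds when the caller is the sole reader and
 * no writer is waiting, so callers typically fall back to dropping the
 * shared lock and reacquiring it exclusively, revalidating the page
 * afterwards since it was briefly unlocked:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM))
 *			return;		(lock not obtained; give up or retry)
 *		<revalidate the page identity and state>
 *	}
 */
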
/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the page's i/o lock is held, 0 if it is not.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root; return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
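
/*
 * Illustrative caller pattern for page_szc_lock() (not part of the original
 * file): the page must already be SE_SHARED/SE_EXCL locked, and the returned
 * mutex, if any, blocks hat_page_demote() until it is released.
 *
 *	mtx = page_szc_lock(pp);	(NULL means pp->p_szc == 0)
 *	<operate on the large page knowing p_szc cannot change>
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */
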
/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}