1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 
  26 /*
  27  * VM - page locking primitives
  28  */
  29 #include <sys/param.h>
  30 #include <sys/t_lock.h>
  31 #include <sys/vtrace.h>
  32 #include <sys/debug.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/bitmap.h>
  35 #include <sys/lockstat.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/condvar_impl.h>
  38 #include <vm/page.h>
  39 #include <vm/seg_enum.h>
  40 #include <vm/vm_dep.h>
  41 #include <vm/seg_kmem.h>
  42 
  43 /*
  44  * This global mutex array is for logical page locking.
  45  * The following fields in the page structure are protected
  46  * by this lock:
  47  *
  48  *      p_lckcnt
  49  *      p_cowcnt
  50  */
  51 pad_mutex_t page_llocks[8 * NCPU_P2];
  52 
  53 /*
  54  * This is a global lock for the logical page free list.  The
  55  * logical free list, in this implementation, is maintained as two
  56  * separate physical lists - the cache list and the free list.
  57  */
  58 kmutex_t  page_freelock;
  59 
  60 /*
  61  * The hash table, page_hash[], the p_selock fields, and the
  62  * list of pages associated with vnodes are protected by arrays of mutexes.
  63  *
  64  * Unless the hashes are changed radically, the table sizes must be
  65  * a power of two.  Also, we typically need more mutexes for the
  66  * vnodes since these locks are occasionally held for long periods.
  67  * And since there seem to be two special vnodes (kvp and swapvp),
  68  * we make room for private mutexes for them.
  69  *
  70  * The pse_mutex[] array holds the mutexes to protect the p_selock
  71  * fields of all page_t structures.
  72  *
  73  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
  74  * when given a pointer to a page_t.
  75  *
  76  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
  77  * should go to the trouble of setting it up at run time and base it
  78  * on memory size rather than the number of compile time CPUs.
  79  *
  80  * XX64 We should be using physmem size to calculate PIO_SHIFT.
  81  *
  82  *      These might break in 64 bit world.
  83  */
  84 #define PIO_SHIFT       7       /* log2(sizeof(page_t)) */
  85 #define PIO_TABLE_SIZE  128     /* number of io mutexes to have */
  86 
  87 pad_mutex_t     ph_mutex[PH_TABLE_SIZE];
  88 kmutex_t        pio_mutex[PIO_TABLE_SIZE];
  89 
  90 #define PAGE_IO_MUTEX(pp) \
  91             &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
  92 
  93 /*
  94  * The pse_mutex[] array is allocated in the platform startup code
  95  * based on the size of the machine at startup.
  96  */
  97 extern pad_mutex_t *pse_mutex;          /* Locks protecting pp->p_selock */
  98 extern size_t pse_table_size;           /* Number of mutexes in pse_mutex[] */
  99 extern int pse_shift;                   /* log2(pse_table_size) */
 100 #define PAGE_SE_MUTEX(pp)       &pse_mutex[                         \
 101         ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &   \
 102         (pse_table_size - 1)].pad_mutex
 103 
 104 #define PSZC_MTX_TABLE_SIZE     128
 105 #define PSZC_MTX_TABLE_SHIFT    7
 106 
 107 static pad_mutex_t      pszc_mutex[PSZC_MTX_TABLE_SIZE];
 108 
 109 #define PAGE_SZC_MUTEX(_pp) \
 110             &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
 111                 ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
 112                 ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
 113                 (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
 114 
 115 /*
 116  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
 117  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 118  * and p_vpnext).
 119  *
 120  * The page_vnode_mutex(vp) function returns the address of the appropriate
 121  * mutex from this array given a pointer to a vnode.  It is complicated
 122  * by the fact that the kernel's vnode and the swapfs vnode are referenced
  123  * frequently enough to warrant their own mutexes.
 124  *
 125  * The VP_HASH_FUNC returns the index into the vph_mutex array given
 126  * an address of a vnode.
 127  */
 128 
 129 #if defined(_LP64)
 130 #define VPH_TABLE_SIZE  (8 * NCPU_P2)
 131 #else   /* 32 bits */
 132 #define VPH_TABLE_SIZE  (2 * NCPU_P2)
 133 #endif
 134 
 135 #define VP_HASH_FUNC(vp) \
 136         ((((uintptr_t)(vp) >> 6) + \
 137             ((uintptr_t)(vp) >> 8) + \
 138             ((uintptr_t)(vp) >> 10) + \
 139             ((uintptr_t)(vp) >> 12)) \
 140             & (VPH_TABLE_SIZE - 1))
 141 
 142 /*
 143  * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 144  * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 145  * VPH_TABLE_SIZE + 1.
 146  */
 147 
 148 kmutex_t        vph_mutex[VPH_TABLE_SIZE + 2];
 149 
 150 /*
 151  * Initialize the locks used by the Virtual Memory Management system.
 152  */
void
page_lock_init()
{
	/*
	 * Nothing to do: the lock arrays defined in this file
	 * (page_llocks, page_freelock, ph_mutex, pio_mutex, pszc_mutex,
	 * vph_mutex) are statically allocated and receive no explicit
	 * initialization here.  NOTE(review): presumably retained as a
	 * hook in the platform startup sequence — confirm with callers.
	 */
}
 157 
 158 /*
 159  * Return a value for pse_shift based on npg (the number of physical pages)
 160  * and ncpu (the maximum number of CPUs).  This is called by platform startup
 161  * code.
 162  *
 163  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 164  * locks grew approximately as the square of the number of threads executing.
 165  * So the primary scaling factor used is NCPU^2.  The size of the machine in
 166  * megabytes is used as an upper bound, particularly for sun4v machines which
 167  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 168  * (128) is used as a minimum.  Since the size of the table has to be a power
 169  * of two, the calculated size is rounded up to the next power of two.
 170  */
 171 /*ARGSUSED*/
 172 int
 173 size_pse_array(pgcnt_t npg, int ncpu)
 174 {
 175         size_t size;
 176         pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
 177 
 178         size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
 179         size += (1 << (highbit(size) - 1)) - 1;
 180         return (highbit(size) - 1);
 181 }
 182 
 183 /*
 184  * At present we only use page ownership to aid debugging, so it's
 185  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 186  * can map to the same owner because we just 'or' in 0x80000000 and
 187  * then clear the second highest bit, so that (for example) 0x2faced00
 188  * and 0xafaced00 both map to 0xafaced00.
 189  * In the 64-bit world, p_selock may not be large enough to hold a full
 190  * thread pointer.  If we ever need precise ownership (e.g. if we implement
 191  * priority inheritance for page locks) then p_selock should become a
 192  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 193  */
 194 #define SE_WRITER       (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
 195 #define SE_READER       1
 196 
 197 /*
 198  * A page that is deleted must be marked as such using the
 199  * page_lock_delete() function. The page must be exclusively locked.
 200  * The SE_DELETED marker is put in p_selock when this function is called.
 201  * SE_DELETED must be distinct from any SE_WRITER value.
 202  */
 203 #define SE_DELETED      (1 | INT_MIN)
 204 
 205 #ifdef VM_STATS
 206 uint_t  vph_kvp_count;
 207 uint_t  vph_swapfsvp_count;
 208 uint_t  vph_other;
 209 #endif /* VM_STATS */
 210 
 211 #ifdef VM_STATS
 212 uint_t  page_lock_count;
 213 uint_t  page_lock_miss;
 214 uint_t  page_lock_miss_lock;
 215 uint_t  page_lock_reclaim;
 216 uint_t  page_lock_bad_reclaim;
 217 uint_t  page_lock_same_page;
 218 uint_t  page_lock_upgrade;
 219 uint_t  page_lock_retired;
 220 uint_t  page_lock_upgrade_failed;
 221 uint_t  page_lock_deleted;
 222 
 223 uint_t  page_trylock_locked;
 224 uint_t  page_trylock_failed;
 225 uint_t  page_trylock_missed;
 226 
 227 uint_t  page_try_reclaim_upgrade;
 228 #endif /* VM_STATS */
 229 
 230 /*
 231  * Acquire the "shared/exclusive" lock on a page.
 232  *
 233  * Returns 1 on success and locks the page appropriately.
 234  *         0 on failure and does not lock the page.
 235  *
 236  * If `lock' is non-NULL, it will be dropped and reacquired in the
 237  * failure case.  This routine can block, and if it does
 238  * it will always return a failure since the page identity [vp, off]
 239  * or state may have changed.
 240  */
 241 
int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	/*
	 * Plain wrapper around page_lock_es() with es == 0: no
	 * SE_EXCL_WANTED priority, no SE_RETIRED override, no
	 * opportunistic shared-to-exclusive upgrade.
	 */
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
 247 
 248 /*
 249  * With the addition of reader-writer lock semantics to page_lock_es,
 250  * callers wanting an exclusive (writer) lock may prevent shared-lock
 251  * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 252  * In this case, when an exclusive lock cannot be acquired, p_selock's
 253  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 254  * if the page is slated for retirement.
 255  *
 256  * The se and es parameters determine if the lock should be granted
 257  * based on the following decision table:
 258  *
 259  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
 260  * ----------- -------------- -------------------  ---------
 261  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
 262  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
 263  * SE_EXCL        none         any lock/any        deny
 264  * SE_SHARED      n/a [2]        shared/0          grant
 265  * SE_SHARED      n/a [2]      unlocked/0          grant
 266  * SE_SHARED      n/a            shared/1          deny
 267  * SE_SHARED      n/a          unlocked/1          deny
 268  * SE_SHARED      n/a              excl/any        deny
 269  *
 270  * Notes:
 271  * [1] The code grants an exclusive lock to the caller and clears the bit
 272  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 273  *   bit's value.  This was deemed acceptable as we are not concerned about
 274  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 275  *   fifo mechanism should also be implemented. Meantime, the thread that
 276  *   set SE_EWANTED should be prepared to catch this condition and reset it
 277  *
 278  * [2] Retired pages may not be locked at any time, regardless of the
  279  *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 280  *
 281  * Notes on values of "es":
 282  *
 283  *   es & 1: page_lookup_create will attempt page relocation
 284  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 285  *       memory thread); this prevents reader-starvation of waiting
 286  *       writer thread(s) by giving priority to writers over readers.
 287  *   es & SE_RETIRED: caller wants to lock pages even if they are
 288  *       retired.  Default is to deny the lock if the page is retired.
 289  *
 290  * And yes, we know, the semantics of this function are too complicated.
 291  * It's on the list to be cleaned up.
 292  */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	/* If the caller passed a mutex to drop on failure, it must be held. */
	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	/* `pse' protects both p_selock and p_cv for this page. */
	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful with an exclusive request. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages may not be locked unless the caller set SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 (page_lookup_create relocation): opportunistically
	 * take the exclusive lock when the page is currently unlocked.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			/* readers are denied while a writer is waiting */
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/* a deleted page can never be locked: don't wait for it */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					/* caller asked for SE_SHARED */
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
 468 
 469 /*
 470  * Clear the SE_EWANTED bit from p_selock.  This function allows
 471  * callers of page_lock_es and page_try_reclaim_lock to clear
 472  * their setting of this bit if they decide they no longer wish
 473  * to gain exclusive access to the page.  Currently only
 474  * delete_memory_thread uses this when the delete memory
 475  * operation is cancelled.
 476  */
 477 void
 478 page_lock_clr_exclwanted(page_t *pp)
 479 {
 480         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 481 
 482         mutex_enter(pse);
 483         pp->p_selock &= ~SE_EWANTED;
 484         if (CV_HAS_WAITERS(&pp->p_cv))
 485                 cv_broadcast(&pp->p_cv);
 486         mutex_exit(pse);
 487 }
 488 
 489 /*
 490  * Read the comments inside of page_lock_es() carefully.
 491  *
 492  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 493  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 494  * This is used by threads subject to reader-starvation (eg. memory delete).
 495  *
 496  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 497  * it is expected that it will retry at a later time.  Threads that will
 498  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 499  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 500  * the bit is cleared.)
 501  */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	/* snapshot of p_selock; stable while we hold `pse' */
	old = pp->p_selock;

	/* SE_EXCL_WANTED is only meaningful with an exclusive request. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages may not be locked unless the caller set SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	/*
	 * es == 1 (page_lookup_create relocation): opportunistically
	 * upgrade to an exclusive request when the page is unlocked.
	 */
	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
 568 
 569 /*
 570  * Acquire a page's "shared/exclusive" lock, but never block.
 571  * Returns 1 on success, 0 on failure.
 572  */
 573 int
 574 page_trylock(page_t *pp, se_t se)
 575 {
 576         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 577 
 578         mutex_enter(pse);
 579         if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
 580             (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
 581                 /*
 582                  * Fail if a thread wants exclusive access and page is
 583                  * retired, if the page is slated for retirement, or a
 584                  * share lock is requested.
 585                  */
 586                 mutex_exit(pse);
 587                 VM_STAT_ADD(page_trylock_failed);
 588                 return (0);
 589         }
 590 
 591         if (se == SE_EXCL) {
 592                 if (pp->p_selock == 0) {
 593                         THREAD_KPRI_REQUEST();
 594                         pp->p_selock = SE_WRITER;
 595                         mutex_exit(pse);
 596                         return (1);
 597                 }
 598         } else {
 599                 if (pp->p_selock >= 0) {
 600                         pp->p_selock += SE_READER;
 601                         mutex_exit(pse);
 602                         return (1);
 603                 }
 604         }
 605         mutex_exit(pse);
 606         return (0);
 607 }
 608 
 609 /*
 610  * Variant of page_unlock() specifically for the page freelist
 611  * code. The mere existence of this code is a vile hack that
 612  * has resulted due to the backwards locking order of the page
 613  * freelist manager; please don't call it.
 614  */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		/* last reader: drop to unlocked, preserving SE_EWANTED */
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		/* negative p_selock means writer-held: release exclusive */
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		/* more than one reader: just drop one share count */
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}
 644 
 645 /*
 646  * Release the page's "shared/exclusive" lock and wake up anyone
 647  * who might be waiting for it.
 648  */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		/* last reader: drop to unlocked, preserving SE_EWANTED */
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		/* negative p_selock means writer-held: release exclusive */
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		/* more than one reader: just drop one share count */
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	/* If the page became fully unlocked, it may need to be captured. */
	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			/* re-take the writer lock for the capture attempt */
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}
 696 
 697 /*
 698  * Try to upgrade the lock on the page from a "shared" to an
 699  * "exclusive" lock.  Since this upgrade operation is done while
 700  * holding the mutex protecting this page, no one else can acquire this page's
 701  * lock and change the page. Thus, it is safe to drop the "shared"
 702  * lock and attempt to acquire the "exclusive" lock.
 703  *
 704  * Returns 1 on success, 0 on failure.
 705  */
 706 int
 707 page_tryupgrade(page_t *pp)
 708 {
 709         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 710 
 711         mutex_enter(pse);
 712         if (!(pp->p_selock & SE_EWANTED)) {
 713                 /* no threads want exclusive access, try upgrade */
 714                 if (pp->p_selock == SE_READER) {
 715                         THREAD_KPRI_REQUEST();
 716                         /* convert to exclusive lock */
 717                         pp->p_selock = SE_WRITER;
 718                         mutex_exit(pse);
 719                         return (1);
 720                 }
 721         }
 722         mutex_exit(pse);
 723         return (0);
 724 }
 725 
 726 /*
 727  * Downgrade the "exclusive" lock on the page to a "shared" lock
 728  * while holding the mutex protecting this page's p_selock field.
 729  */
 730 void
 731 page_downgrade(page_t *pp)
 732 {
 733         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 734         int excl_waiting;
 735 
 736         ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
 737         ASSERT(PAGE_EXCL(pp));
 738 
 739         mutex_enter(pse);
 740         excl_waiting =  pp->p_selock & SE_EWANTED;
 741         THREAD_KPRI_RELEASE();
 742         pp->p_selock = SE_READER | excl_waiting;
 743         if (CV_HAS_WAITERS(&pp->p_cv))
 744                 cv_broadcast(&pp->p_cv);
 745         mutex_exit(pse);
 746 }
 747 
 748 void
 749 page_lock_delete(page_t *pp)
 750 {
 751         kmutex_t *pse = PAGE_SE_MUTEX(pp);
 752 
 753         ASSERT(PAGE_EXCL(pp));
 754         ASSERT(pp->p_vnode == NULL);
 755         ASSERT(pp->p_offset == (u_offset_t)-1);
 756         ASSERT(!PP_ISFREE(pp));
 757 
 758         mutex_enter(pse);
 759         THREAD_KPRI_RELEASE();
 760         pp->p_selock = SE_DELETED;
 761         if (CV_HAS_WAITERS(&pp->p_cv))
 762                 cv_broadcast(&pp->p_cv);
 763         mutex_exit(pse);
 764 }
 765 
 766 int
 767 page_deleted(page_t *pp)
 768 {
 769         return (pp->p_selock == SE_DELETED);
 770 }
 771 
 772 /*
 773  * Implement the io lock for pages
 774  */
 775 void
 776 page_iolock_init(page_t *pp)
 777 {
 778         pp->p_iolock_state = 0;
 779         cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
 780 }
 781 
 782 /*
 783  * Acquire the i/o lock on a page.
 784  */
 785 void
 786 page_io_lock(page_t *pp)
 787 {
 788         kmutex_t *pio;
 789 
 790         pio = PAGE_IO_MUTEX(pp);
 791         mutex_enter(pio);
 792         while (pp->p_iolock_state & PAGE_IO_INUSE) {
 793                 cv_wait(&(pp->p_io_cv), pio);
 794         }
 795         pp->p_iolock_state |= PAGE_IO_INUSE;
 796         mutex_exit(pio);
 797 }
 798 
 799 /*
 800  * Release the i/o lock on a page.
 801  */
 802 void
 803 page_io_unlock(page_t *pp)
 804 {
 805         kmutex_t *pio;
 806 
 807         pio = PAGE_IO_MUTEX(pp);
 808         mutex_enter(pio);
 809         cv_broadcast(&pp->p_io_cv);
 810         pp->p_iolock_state &= ~PAGE_IO_INUSE;
 811         mutex_exit(pio);
 812 }
 813 
 814 /*
 815  * Try to acquire the i/o lock on a page without blocking.
 816  * Returns 1 on success, 0 on failure.
 817  */
 818 int
 819 page_io_trylock(page_t *pp)
 820 {
 821         kmutex_t *pio;
 822 
 823         if (pp->p_iolock_state & PAGE_IO_INUSE)
 824                 return (0);
 825 
 826         pio = PAGE_IO_MUTEX(pp);
 827         mutex_enter(pio);
 828 
 829         if (pp->p_iolock_state & PAGE_IO_INUSE) {
 830                 mutex_exit(pio);
 831                 return (0);
 832         }
 833         pp->p_iolock_state |= PAGE_IO_INUSE;
 834         mutex_exit(pio);
 835 
 836         return (1);
 837 }
 838 
 839 /*
 840  * Wait until the i/o lock is not held.
 841  */
 842 void
 843 page_io_wait(page_t *pp)
 844 {
 845         kmutex_t *pio;
 846 
 847         pio = PAGE_IO_MUTEX(pp);
 848         mutex_enter(pio);
 849         while (pp->p_iolock_state & PAGE_IO_INUSE) {
 850                 cv_wait(&(pp->p_io_cv), pio);
 851         }
 852         mutex_exit(pio);
 853 }
 854 
 855 /*
 856  * Returns 1 on success, 0 on failure.
 857  */
 858 int
 859 page_io_locked(page_t *pp)
 860 {
 861         return (pp->p_iolock_state & PAGE_IO_INUSE);
 862 }
 863 
 864 /*
 865  * Assert that the i/o lock on a page is held.
 866  * Returns 1 on success, 0 on failure.
 867  */
 868 int
 869 page_iolock_assert(page_t *pp)
 870 {
 871         return (page_io_locked(pp));
 872 }
 873 
 874 /*
 875  * Wrapper exported to kernel routines that are built
 876  * platform-independent (the macro is platform-dependent;
 877  * the size of vph_mutex[] is based on NCPU).
 878  *
 879  * Note that you can do stress testing on this by setting the
 880  * variable page_vnode_mutex_stress to something other than
 881  * zero in a DEBUG kernel in a debugger after loading the kernel.
 882  * Setting it after the kernel is running may not work correctly.
 883  */
#ifdef DEBUG
/* DEBUG-only stress knob; see page_vnode_mutex() for its effect. */
static int page_vnode_mutex_stress = 0;
#endif
 887 
 888 kmutex_t *
 889 page_vnode_mutex(vnode_t *vp)
 890 {
 891         if (vp == &kvp)
 892                 return (&vph_mutex[VPH_TABLE_SIZE + 0]);
 893 
 894         if (vp == &zvp)
 895                 return (&vph_mutex[VPH_TABLE_SIZE + 1]);
 896 #ifdef DEBUG
 897         if (page_vnode_mutex_stress != 0)
 898                 return (&vph_mutex[0]);
 899 #endif
 900 
 901         return (&vph_mutex[VP_HASH_FUNC(vp)]);
 902 }
 903 
 904 kmutex_t *
 905 page_se_mutex(page_t *pp)
 906 {
 907         return (PAGE_SE_MUTEX(pp));
 908 }
 909 
#ifdef VM_STATS
/* Outcome counters for page_szc_lock(); see the VM_STAT_ADD() calls there. */
uint_t pszclck_stat[4];
#endif
 913 /*
 914  * Find, take and return a mutex held by hat_page_demote().
 915  * Called by page_demote_vp_pages() before hat_page_demote() call and by
 916  * routines that want to block hat_page_demote() but can't do it
 917  * via locking all constituent pages.
 918  *
 919  * Return NULL if p_szc is 0.
 920  *
 921  * It should only be used for pages that can be demoted by hat_page_demote()
 922  * i.e. non swapfs file system pages.  The logic here is lifted from
 923  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 924  * since the page is locked and not free.
 925  *
 926  * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 928  * of the root this routine relies on the fact that hat_page_demote() changes
 929  * root last.
 930  *
 931  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
 932  * returned pp's p_szc may be any value.
 933  */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;	/* snapshot; can change under us */

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	/* p_szc == 0: no szc lock to take (see the block comment above). */
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * since p_szc can only decrease if pp == rootpp
	 * rootpp will be always the same i.e we have the right root
	 * regardless of rootpp->p_szc.
	 * If location of pp's root didn't change after we took
	 * the lock we have the right root. return mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;	/* retry at the smaller size */
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	/* enter/exit pair blocks until the demoting thread drops this mutex */
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
1000 
1001 int
1002 page_szc_lock_assert(page_t *pp)
1003 {
1004         page_t *rootpp = PP_PAGEROOT(pp);
1005         kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1006 
1007         return (MUTEX_HELD(mtx));
1008 }
1009 
1010 /*
1011  * memseg locking
1012  */
1013 static krwlock_t memsegslock;
1014 
1015 /*
1016  * memlist (phys_install, phys_avail) locking.
1017  */
1018 static krwlock_t memlists_lock;
1019 
1020 int
1021 memsegs_trylock(int writer)
1022 {
1023         return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
1024 }
1025 
1026 void
1027 memsegs_lock(int writer)
1028 {
1029         rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1030 }
1031 
1032 /*ARGSUSED*/
1033 void
1034 memsegs_unlock(int writer)
1035 {
1036         rw_exit(&memsegslock);
1037 }
1038 
1039 int
1040 memsegs_lock_held(void)
1041 {
1042         return (RW_LOCK_HELD(&memsegslock));
1043 }
1044 
1045 void
1046 memlist_read_lock(void)
1047 {
1048         rw_enter(&memlists_lock, RW_READER);
1049 }
1050 
1051 void
1052 memlist_read_unlock(void)
1053 {
1054         rw_exit(&memlists_lock);
1055 }
1056 
1057 void
1058 memlist_write_lock(void)
1059 {
1060         rw_enter(&memlists_lock, RW_WRITER);
1061 }
1062 
1063 void
1064 memlist_write_unlock(void)
1065 {
1066         rw_exit(&memlists_lock);
1067 }