11909 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/vm/page_lock.c
+++ new/usr/src/uts/common/vm/page_lock.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2019 Joyent, Inc.
  23   24   */
  24   25  
  25   26  
  26   27  /*
  27   28   * VM - page locking primitives
  28   29   */
  29   30  #include <sys/param.h>
  30   31  #include <sys/t_lock.h>
  31   32  #include <sys/vtrace.h>
  32   33  #include <sys/debug.h>
  33   34  #include <sys/cmn_err.h>
  34   35  #include <sys/bitmap.h>
  35   36  #include <sys/lockstat.h>
  36   37  #include <sys/sysmacros.h>
  37   38  #include <sys/condvar_impl.h>
  38   39  #include <vm/page.h>
  39   40  #include <vm/seg_enum.h>
  40   41  #include <vm/vm_dep.h>
  41   42  #include <vm/seg_kmem.h>
  42   43  
  43   44  /*
  44   45   * This global mutex array is for logical page locking.
  45   46   * The following fields in the page structure are protected
  46   47   * by this lock:
  47   48   *
  48   49   *      p_lckcnt
  49   50   *      p_cowcnt
  50   51   */
  51   52  pad_mutex_t page_llocks[8 * NCPU_P2];
  52   53  
  53   54  /*
  54   55   * This is a global lock for the logical page free list.  The
  55   56   * logical free list, in this implementation, is maintained as two
  56   57   * separate physical lists - the cache list and the free list.
  57   58   */
  58   59  kmutex_t  page_freelock;
  59   60  
  60   61  /*
  61   62   * The hash table, page_hash[], the p_selock fields, and the
  62   63   * list of pages associated with vnodes are protected by arrays of mutexes.
  63   64   *
  64   65   * Unless the hashes are changed radically, the table sizes must be
  65   66   * a power of two.  Also, we typically need more mutexes for the
  66   67   * vnodes since these locks are occasionally held for long periods.
  67   68   * And since there seem to be two special vnodes (kvp and swapvp),
  68   69   * we make room for private mutexes for them.
  69   70   *
  70   71   * The pse_mutex[] array holds the mutexes to protect the p_selock
  71   72   * fields of all page_t structures.
  72   73   *
  73   74   * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
  74   75   * when given a pointer to a page_t.
  75   76   *
  76   77   * PIO_TABLE_SIZE must be a power of two.  One could argue that we
  77   78   * should go to the trouble of setting it up at run time and base it
  78   79   * on memory size rather than the number of compile time CPUs.
  79   80   *
  80   81   * XX64 We should be using physmem size to calculate PIO_SHIFT.
  81   82   *
  82   83   *      These might break in 64 bit world.
  83   84   */
  84   85  #define PIO_SHIFT       7       /* log2(sizeof(page_t)) */
  85   86  #define PIO_TABLE_SIZE  128     /* number of io mutexes to have */
  86   87  
  87   88  pad_mutex_t     ph_mutex[PH_TABLE_SIZE];
  88   89  kmutex_t        pio_mutex[PIO_TABLE_SIZE];
  89   90  
  90   91  #define PAGE_IO_MUTEX(pp) \
  91   92              &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
  92   93  
  93   94  /*
  94   95   * The pse_mutex[] array is allocated in the platform startup code
  95   96   * based on the size of the machine at startup.
  96   97   */
  97   98  extern pad_mutex_t *pse_mutex;          /* Locks protecting pp->p_selock */
  98   99  extern size_t pse_table_size;           /* Number of mutexes in pse_mutex[] */
  99  100  extern int pse_shift;                   /* log2(pse_table_size) */
 100  101  #define PAGE_SE_MUTEX(pp)       &pse_mutex[                             \
 101  102          ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &   \
 102  103          (pse_table_size - 1)].pad_mutex
 103  104  
 104  105  #define PSZC_MTX_TABLE_SIZE     128
 105  106  #define PSZC_MTX_TABLE_SHIFT    7
 106  107  
 107  108  static pad_mutex_t      pszc_mutex[PSZC_MTX_TABLE_SIZE];
 108  109  
 109  110  #define PAGE_SZC_MUTEX(_pp) \
 110  111              &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
 111  112                  ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
 112  113                  ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
 113  114                  (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
 114  115  
 115  116  /*
 116  117   * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
 117  118   * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 118  119   * and p_vpnext).
 119  120   *
 120  121   * The page_vnode_mutex(vp) function returns the address of the appropriate
 121  122   * mutex from this array given a pointer to a vnode.  It is complicated
 122  123   * by the fact that the kernel's vnode and the swapfs vnode are referenced
 123  124   * frequently enough to warrant their own mutexes.
 124  125   *
 125  126   * The VP_HASH_FUNC returns the index into the vph_mutex array given
 126  127   * an address of a vnode.
 127  128   */
 128  129  
 129  130  #if defined(_LP64)
 130  131  #define VPH_TABLE_SIZE  (8 * NCPU_P2)
 131  132  #else   /* 32 bits */
 132  133  #define VPH_TABLE_SIZE  (2 * NCPU_P2)
 133  134  #endif
 134  135  
 135  136  #define VP_HASH_FUNC(vp) \
 136  137          ((((uintptr_t)(vp) >> 6) + \
 137  138              ((uintptr_t)(vp) >> 8) + \
 138  139              ((uintptr_t)(vp) >> 10) + \
 139  140              ((uintptr_t)(vp) >> 12)) \
 140  141              & (VPH_TABLE_SIZE - 1))
 141  142  
 142  143  /*
 143  144   * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 144  145   * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 145  146   * VPH_TABLE_SIZE + 1.
 146  147   */
 147  148  
 148  149  kmutex_t        vph_mutex[VPH_TABLE_SIZE + 2];
 149  150  
 150  151  /*
 151  152   * Initialize the locks used by the Virtual Memory Management system.
 152  153   */
 153  154  void
 154  155  page_lock_init()
 155  156  {
 156  157  }
 157  158  
 158  159  /*
 159  160   * Return a value for pse_shift based on npg (the number of physical pages)
 160  161   * and ncpu (the maximum number of CPUs).  This is called by platform startup
 161  162   * code.
 162  163   *
 163  164   * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 164  165   * locks grew approximately as the square of the number of threads executing.
 165  166   * So the primary scaling factor used is NCPU^2.  The size of the machine in
 166  167   * megabytes is used as an upper bound, particularly for sun4v machines which
 167  168   * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 168  169   * (128) is used as a minimum.  Since the size of the table has to be a power
 169  170   * of two, the calculated size is rounded up to the next power of two.
 170  171   */
 171  172  /*ARGSUSED*/
 172  173  int
 173  174  size_pse_array(pgcnt_t npg, int ncpu)
 174  175  {
 175  176          size_t size;
 176  177          pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
 177  178  
 178  179          size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
 179  180          size += (1 << (highbit(size) - 1)) - 1;
 180  181          return (highbit(size) - 1);
 181  182  }
 182  183  
 183  184  /*
 184  185   * At present we only use page ownership to aid debugging, so it's
 185  186   * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 186  187   * can map to the same owner because we just 'or' in 0x80000000 and
 187  188   * then clear the second highest bit, so that (for example) 0x2faced00
 188  189   * and 0xafaced00 both map to 0xafaced00.
 189  190   * In the 64-bit world, p_selock may not be large enough to hold a full
 190  191   * thread pointer.  If we ever need precise ownership (e.g. if we implement
 191  192   * priority inheritance for page locks) then p_selock should become a
 192  193   * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 193  194   */
 194  195  #define SE_WRITER       (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
 195  196  #define SE_READER       1
 196  197  
 197  198  /*
 198  199   * A page that is deleted must be marked as such using the
 199  200   * page_lock_delete() function. The page must be exclusively locked.
 200  201   * The SE_DELETED marker is put in p_selock when this function is called.
 201  202   * SE_DELETED must be distinct from any SE_WRITER value.
 202  203   */
 203  204  #define SE_DELETED      (1 | INT_MIN)
 204  205  
 205  206  #ifdef VM_STATS
 206  207  uint_t  vph_kvp_count;
 207  208  uint_t  vph_swapfsvp_count;
 208  209  uint_t  vph_other;
 209  210  #endif /* VM_STATS */
 210  211  
 211  212  #ifdef VM_STATS
 212  213  uint_t  page_lock_count;
 213  214  uint_t  page_lock_miss;
 214  215  uint_t  page_lock_miss_lock;
 215  216  uint_t  page_lock_reclaim;
 216  217  uint_t  page_lock_bad_reclaim;
 217  218  uint_t  page_lock_same_page;
 218  219  uint_t  page_lock_upgrade;
 219  220  uint_t  page_lock_retired;
 220  221  uint_t  page_lock_upgrade_failed;
 221  222  uint_t  page_lock_deleted;
 222  223  
 223  224  uint_t  page_trylock_locked;
 224  225  uint_t  page_trylock_failed;
 225  226  uint_t  page_trylock_missed;
 226  227  
 227  228  uint_t  page_try_reclaim_upgrade;
 228  229  #endif /* VM_STATS */
 229  230  
 230  231  /*
 231  232   * Acquire the "shared/exclusive" lock on a page.
 232  233   *
 233  234   * Returns 1 on success and locks the page appropriately.
 234  235   *         0 on failure and does not lock the page.
 235  236   *
 236  237   * If `lock' is non-NULL, it will be dropped and reacquired in the
 237  238   * failure case.  This routine can block, and if it does
 238  239   * it will always return a failure since the page identity [vp, off]
 239  240   * or state may have changed.
 240  241   */
 241  242  
 242  243  int
 243  244  page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
 244  245  {
 245  246          return (page_lock_es(pp, se, lock, reclaim, 0));
 246  247  }
 247  248  
 248  249  /*
 249  250   * With the addition of reader-writer lock semantics to page_lock_es,
 250  251   * callers wanting an exclusive (writer) lock may prevent shared-lock
 251  252   * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 252  253   * In this case, when an exclusive lock cannot be acquired, p_selock's
 253  254   * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 254  255   * if the page is slated for retirement.
 255  256   *
 256  257   * The se and es parameters determine if the lock should be granted
 257  258   * based on the following decision table:
 258  259   *
 259  260   * Lock wanted   es flags     p_selock/SE_EWANTED  Action
 260  261   * ----------- -------------- -------------------  ---------
 261  262   * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
 262  263   * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
 263  264   * SE_EXCL        none         any lock/any        deny
 264  265   * SE_SHARED      n/a [2]        shared/0          grant
 265  266   * SE_SHARED      n/a [2]      unlocked/0          grant
 266  267   * SE_SHARED      n/a            shared/1          deny
 267  268   * SE_SHARED      n/a          unlocked/1          deny
 268  269   * SE_SHARED      n/a              excl/any        deny
 269  270   *
 270  271   * Notes:
 271  272   * [1] The code grants an exclusive lock to the caller and clears the bit
 272  273   *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 273  274   *   bit's value.  This was deemed acceptable as we are not concerned about
 274  275   *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 275  276   *   fifo mechanism should also be implemented. Meantime, the thread that
 276  277   *   set SE_EWANTED should be prepared to catch this condition and reset it
 277  278   *
 278  279   * [2] Retired pages may not be locked at any time, regardless of the
 279  280   *   disposition of se, unless the es parameter has the SE_RETIRED flag set.
 280  281   *
 281  282   * Notes on values of "es":
 282  283   *
 283  284   *   es & 1: page_lookup_create will attempt page relocation
 284  285   *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 285  286   *       memory thread); this prevents reader-starvation of waiting
 286  287   *       writer thread(s) by giving priority to writers over readers.
 287  288   *   es & SE_RETIRED: caller wants to lock pages even if they are
 288  289   *       retired.  Default is to deny the lock if the page is retired.
 289  290   *
 290  291   * And yes, we know, the semantics of this function are too complicated.
 291  292   * It's on the list to be cleaned up.
 292  293   */
 293  294  int
 294  295  page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
 295  296  {
 296  297          int             retval;
 297  298          kmutex_t        *pse = PAGE_SE_MUTEX(pp);
 298  299          int             upgraded;
 299  300          int             reclaim_it;
 300  301  
 301  302          ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
 302  303  
 303  304          VM_STAT_ADD(page_lock_count);
 304  305  
 305  306          upgraded = 0;
 306  307          reclaim_it = 0;
 307  308  
 308  309          mutex_enter(pse);
 309  310  
 310  311          ASSERT(((es & SE_EXCL_WANTED) == 0) ||
 311  312              ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
 312  313  
 313  314          if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
 314  315                  mutex_exit(pse);
 315  316                  VM_STAT_ADD(page_lock_retired);
 316  317                  return (0);
 317  318          }
 318  319  
 319  320          if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
 320  321                  se = SE_EXCL;
 321  322          }
 322  323  
 323  324          if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
 324  325  
 325  326                  reclaim_it = 1;
 326  327                  if (se == SE_SHARED) {
 327  328                          /*
 328  329                           * This is an interesting situation.
 329  330                           *
 330  331                           * Remember that p_free can only change if
 331  332                           * p_selock < 0.
 332  333                           * p_free does not depend on our holding `pse'.
 333  334                           * And, since we hold `pse', p_selock can not change.
 334  335                           * So, if p_free changes on us, the page is already
 335  336                           * exclusively held, and we would fail to get p_selock
 336  337                           * regardless.
 337  338                           *
 338  339                           * We want to avoid getting the share
 339  340                           * lock on a free page that needs to be reclaimed.
 340  341                           * It is possible that some other thread has the share
 341  342                           * lock and has left the free page on the cache list.
 342  343                           * pvn_vplist_dirty() does this for brief periods.
 343  344                           * If the se_share is currently SE_EXCL, we will fail
 344  345                           * to acquire p_selock anyway.  Blocking is the
 345  346                           * right thing to do.
 346  347                           * If we need to reclaim this page, we must get
 347  348                           * exclusive access to it, force the upgrade now.
 348  349                           * Again, we will fail to acquire p_selock if the
 349  350                           * page is not free and block.
 350  351                           */
 351  352                          upgraded = 1;
 352  353                          se = SE_EXCL;
 353  354                          VM_STAT_ADD(page_lock_upgrade);
 354  355                  }
 355  356          }
 356  357  
 357  358          if (se == SE_EXCL) {
 358  359                  if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
 359  360                          /*
 360  361                           * if the caller wants a writer lock (but did not
 361  362                           * specify exclusive access), and there is a pending
 362  363                           * writer that wants exclusive access, return failure
 363  364                           */
 364  365                          retval = 0;
 365  366                  } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
 366  367                          /* no reader/writer lock held */
 367      -                        THREAD_KPRI_REQUEST();
 368  368                          /* this clears our setting of the SE_EWANTED bit */
 369  369                          pp->p_selock = SE_WRITER;
 370  370                          retval = 1;
 371  371                  } else {
 372  372                          /* page is locked */
 373  373                          if (es & SE_EXCL_WANTED) {
 374  374                                  /* set the SE_EWANTED bit */
 375  375                                  pp->p_selock |= SE_EWANTED;
 376  376                          }
 377  377                          retval = 0;
 378  378                  }
 379  379          } else {
 380  380                  retval = 0;
 381  381                  if (pp->p_selock >= 0) {
 382  382                          if ((pp->p_selock & SE_EWANTED) == 0) {
 383  383                                  pp->p_selock += SE_READER;
 384  384                                  retval = 1;
 385  385                          }
 386  386                  }
 387  387          }
 388  388  
 389  389          if (retval == 0) {
 390  390                  if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
 391  391                          VM_STAT_ADD(page_lock_deleted);
 392  392                          mutex_exit(pse);
 393  393                          return (retval);
 394  394                  }
 395  395  
 396  396  #ifdef VM_STATS
 397  397                  VM_STAT_ADD(page_lock_miss);
 398  398                  if (upgraded) {
 399  399                          VM_STAT_ADD(page_lock_upgrade_failed);
 400  400                  }
 401  401  #endif
 402  402                  if (lock) {
 403  403                          VM_STAT_ADD(page_lock_miss_lock);
 404  404                          mutex_exit(lock);
 405  405                  }
 406  406  
 407  407                  /*
 408  408                   * Now, wait for the page to be unlocked and
 409  409                   * release the lock protecting p_cv and p_selock.
 410  410                   */
 411  411                  cv_wait(&pp->p_cv, pse);
 412  412                  mutex_exit(pse);
 413  413  
 414  414                  /*
 415  415                   * The page identity may have changed while we were
 416  416                   * blocked.  If we are willing to depend on "pp"
 417  417                   * still pointing to a valid page structure (i.e.,
 418  418                   * assuming page structures are not dynamically allocated
 419  419                   * or freed), we could try to lock the page if its
 420  420                   * identity hasn't changed.
 421  421                   *
 422  422                   * This needs to be measured, since we come back from
 423  423                   * cv_wait holding pse (the expensive part of this
 424  424                   * operation) we might as well try the cheap part.
 425  425                   * Though we would also have to confirm that dropping
 426  426                   * `lock' did not cause any grief to the callers.
 427  427                   */
 428  428                  if (lock) {
 429  429                          mutex_enter(lock);
 430  430                  }
 431  431          } else {
 432  432                  /*
 433  433                   * We have the page lock.
 434  434                   * If we needed to reclaim the page, and the page
 435  435                   * needed reclaiming (ie, it was free), then we
 436  436                   * have the page exclusively locked.  We may need
 437  437                   * to downgrade the page.
 438  438                   */
 439  439                  ASSERT((upgraded) ?
 440  440                      ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
 441  441                  mutex_exit(pse);
 442  442  
 443  443                  /*
 444  444                   * We now hold this page's lock, either shared or
 445  445                   * exclusive.  This will prevent its identity from changing.
 446  446                   * The page, however, may or may not be free.  If the caller
 447  447                   * requested, and it is free, go reclaim it from the
 448  448                   * free list.  If the page can't be reclaimed, return failure
 449  449                   * so that the caller can start all over again.
 450  450                   *
 451  451                   * NOTE:page_reclaim() releases the page lock (p_selock)
 452  452                   *      if it can't be reclaimed.
 453  453                   */
 454  454                  if (reclaim_it) {
 455  455                          if (!page_reclaim(pp, lock)) {
 456  456                                  VM_STAT_ADD(page_lock_bad_reclaim);
 457  457                                  retval = 0;
 458  458                          } else {
 459  459                                  VM_STAT_ADD(page_lock_reclaim);
 460  460                                  if (upgraded) {
 461  461                                          page_downgrade(pp);
 462  462                                  }
 463  463                          }
 464  464                  }
 465  465          }
 466  466          return (retval);
 467  467  }
 468  468  
 469  469  /*
 470  470   * Clear the SE_EWANTED bit from p_selock.  This function allows
 471  471   * callers of page_lock_es and page_try_reclaim_lock to clear
 472  472   * their setting of this bit if they decide they no longer wish
 473  473   * to gain exclusive access to the page.  Currently only
 474  474   * delete_memory_thread uses this when the delete memory
 475  475   * operation is cancelled.
 476  476   */
 477  477  void
 478  478  page_lock_clr_exclwanted(page_t *pp)
 479  479  {
 480  480          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 481  481  
 482  482          mutex_enter(pse);
 483  483          pp->p_selock &= ~SE_EWANTED;
 484  484          if (CV_HAS_WAITERS(&pp->p_cv))
 485  485                  cv_broadcast(&pp->p_cv);
 486  486          mutex_exit(pse);
 487  487  }
 488  488  
 489  489  /*
 490  490   * Read the comments inside of page_lock_es() carefully.
 491  491   *
 492  492   * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 493  493   * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 494  494   * This is used by threads subject to reader-starvation (eg. memory delete).
 495  495   *
 496  496   * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 497  497   * it is expected that it will retry at a later time.  Threads that will
 498  498   * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 499  499   * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 500  500   * the bit is cleared.)
 501  501   */
 502  502  int
 503  503  page_try_reclaim_lock(page_t *pp, se_t se, int es)
 504  504  {
 505  505          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 506  506          selock_t old;
 507  507  
 508  508          mutex_enter(pse);
 509  509  
 510  510          old = pp->p_selock;
 511  511  
 512  512          ASSERT(((es & SE_EXCL_WANTED) == 0) ||
 513  513              ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
 514  514  
 515  515          if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
 516  516                  mutex_exit(pse);
 517  517                  VM_STAT_ADD(page_trylock_failed);
 518  518                  return (0);
 519  519          }
 520  520  
 521  521          if (se == SE_SHARED && es == 1 && old == 0) {
 522  522                  se = SE_EXCL;
 523  523          }
 524  524  
 525  525          if (se == SE_SHARED) {
 526  526                  if (!PP_ISFREE(pp)) {
 527  527                          if (old >= 0) {
 528  528                                  /*
 529  529                                   * Readers are not allowed when excl wanted
 530  530                                   */
 531  531                                  if ((old & SE_EWANTED) == 0) {
 532  532                                          pp->p_selock = old + SE_READER;
 533  533                                          mutex_exit(pse);
 534  534                                          return (1);
 535  535                                  }
 536  536                          }
 537  537                          mutex_exit(pse);
 538  538                          return (0);
 539  539                  }
 540  540                  /*
 541  541                   * The page is free, so we really want SE_EXCL (below)
 542  542                   */
 543  543                  VM_STAT_ADD(page_try_reclaim_upgrade);
 544  544          }
 545  545  
 546  546          /*
 547  547           * The caller wants a writer lock.  We try for it only if
 548  548           * SE_EWANTED is not set, or if the caller specified
 549  549           * SE_EXCL_WANTED.
 550  550           */
 551  551          if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
 552  552                  if ((old & ~SE_EWANTED) == 0) {
 553  553                          /* no reader/writer lock held */
 554      -                        THREAD_KPRI_REQUEST();
 555  554                          /* this clears out our setting of the SE_EWANTED bit */
 556  555                          pp->p_selock = SE_WRITER;
 557  556                          mutex_exit(pse);
 558  557                          return (1);
 559  558                  }
 560  559          }
 561  560          if (es & SE_EXCL_WANTED) {
 562  561                  /* page is locked, set the SE_EWANTED bit */
 563  562                  pp->p_selock |= SE_EWANTED;
 564  563          }
 565  564          mutex_exit(pse);
 566  565          return (0);
 567  566  }
 568  567  
 569  568  /*
 570  569   * Acquire a page's "shared/exclusive" lock, but never block.
 571  570   * Returns 1 on success, 0 on failure.
 572  571   */
 573  572  int
 574  573  page_trylock(page_t *pp, se_t se)
 575  574  {
 576  575          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 577  576  
 578  577          mutex_enter(pse);
 579  578          if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
 580  579              (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
 581  580                  /*
 582  581                   * Fail if a thread wants exclusive access and page is
 583  582                   * retired, if the page is slated for retirement, or a
 584  583                   * share lock is requested.
 585  584                   */
 586  585                  mutex_exit(pse);
 587  586                  VM_STAT_ADD(page_trylock_failed);
 588  587                  return (0);
 589  588          }
 590  589  
 591  590          if (se == SE_EXCL) {
 592  591                  if (pp->p_selock == 0) {
 593      -                        THREAD_KPRI_REQUEST();
 594  592                          pp->p_selock = SE_WRITER;
 595  593                          mutex_exit(pse);
 596  594                          return (1);
 597  595                  }
 598  596          } else {
 599  597                  if (pp->p_selock >= 0) {
 600  598                          pp->p_selock += SE_READER;
 601  599                          mutex_exit(pse);
 602  600                          return (1);
 603  601                  }
 604  602          }
 605  603          mutex_exit(pse);
 606  604          return (0);
 607  605  }
 608  606  
 609  607  /*
 610  608   * Variant of page_unlock() specifically for the page freelist
 611  609   * code. The mere existence of this code is a vile hack that
 612  610   * has resulted due to the backwards locking order of the page
 613  611   * freelist manager; please don't call it.
 614  612   */
 615  613  void
 616  614  page_unlock_nocapture(page_t *pp)
 617  615  {
 618  616          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 619  617          selock_t old;
 620  618  
 621  619          mutex_enter(pse);
 622  620  
 623  621          old = pp->p_selock;
 624  622          if ((old & ~SE_EWANTED) == SE_READER) {
 625  623                  pp->p_selock = old & ~SE_READER;
 626  624                  if (CV_HAS_WAITERS(&pp->p_cv))
 627  625                          cv_broadcast(&pp->p_cv);
 628  626          } else if ((old & ~SE_EWANTED) == SE_DELETED) {
 629  627                  panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
 630  628          } else if (old < 0) {
 631      -                THREAD_KPRI_RELEASE();
 632  629                  pp->p_selock &= SE_EWANTED;
 633  630                  if (CV_HAS_WAITERS(&pp->p_cv))
 634  631                          cv_broadcast(&pp->p_cv);
 635  632          } else if ((old & ~SE_EWANTED) > SE_READER) {
 636  633                  pp->p_selock = old - SE_READER;
 637  634          } else {
 638  635                  panic("page_unlock_nocapture: page %p is not locked",
 639  636                      (void *)pp);
 640  637          }
 641  638  
 642  639          mutex_exit(pse);
 643  640  }
 644  641  
 645  642  /*
 646  643   * Release the page's "shared/exclusive" lock and wake up anyone
 647  644   * who might be waiting for it.
 648  645   */
 649  646  void
 650  647  page_unlock(page_t *pp)
 651  648  {
 652  649          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 653  650          selock_t old;
 654  651  
 655  652          mutex_enter(pse);
 656  653  
 657  654          old = pp->p_selock;
 658  655          if ((old & ~SE_EWANTED) == SE_READER) {
 659  656                  pp->p_selock = old & ~SE_READER;
 660  657                  if (CV_HAS_WAITERS(&pp->p_cv))
 661  658                          cv_broadcast(&pp->p_cv);
 662  659          } else if ((old & ~SE_EWANTED) == SE_DELETED) {
 663  660                  panic("page_unlock: page %p is deleted", (void *)pp);
 664  661          } else if (old < 0) {
 665      -                THREAD_KPRI_RELEASE();
 666  662                  pp->p_selock &= SE_EWANTED;
 667  663                  if (CV_HAS_WAITERS(&pp->p_cv))
 668  664                          cv_broadcast(&pp->p_cv);
 669  665          } else if ((old & ~SE_EWANTED) > SE_READER) {
 670  666                  pp->p_selock = old - SE_READER;
 671  667          } else {
 672  668                  panic("page_unlock: page %p is not locked", (void *)pp);
 673  669          }
 674  670  
 675  671          if (pp->p_selock == 0) {
 676  672                  /*
 677  673                   * If the T_CAPTURING bit is set, that means that we should
 678  674                   * not try and capture the page again as we could recurse
 679  675                   * which could lead to a stack overflow panic or spending a
 680  676                   * relatively long time in the kernel making no progress.
 681  677                   */
 682  678                  if ((pp->p_toxic & PR_CAPTURE) &&
 683  679                      !(curthread->t_flag & T_CAPTURING) &&
 684  680                      !PP_RETIRED(pp)) {
 685      -                        THREAD_KPRI_REQUEST();
 686  681                          pp->p_selock = SE_WRITER;
 687  682                          mutex_exit(pse);
 688  683                          page_unlock_capture(pp);
 689  684                  } else {
 690  685                          mutex_exit(pse);
 691  686                  }
 692  687          } else {
 693  688                  mutex_exit(pse);
 694  689          }
 695  690  }
 696  691  
 697  692  /*
 698  693   * Try to upgrade the lock on the page from a "shared" to an
 699  694   * "exclusive" lock.  Since this upgrade operation is done while
 700  695   * holding the mutex protecting this page, no one else can acquire this page's
 701  696   * lock and change the page. Thus, it is safe to drop the "shared"
 702  697   * lock and attempt to acquire the "exclusive" lock.
 703  698   *
 704  699   * Returns 1 on success, 0 on failure.
 705  700   */
 706  701  int
 707  702  page_tryupgrade(page_t *pp)
 708  703  {
 709  704          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 710  705  
 711  706          mutex_enter(pse);
 712  707          if (!(pp->p_selock & SE_EWANTED)) {
 713  708                  /* no threads want exclusive access, try upgrade */
 714  709                  if (pp->p_selock == SE_READER) {
 715      -                        THREAD_KPRI_REQUEST();
 716  710                          /* convert to exclusive lock */
 717  711                          pp->p_selock = SE_WRITER;
 718  712                          mutex_exit(pse);
 719  713                          return (1);
 720  714                  }
 721  715          }
 722  716          mutex_exit(pse);
 723  717          return (0);
 724  718  }
 725  719  
 726  720  /*
 727  721   * Downgrade the "exclusive" lock on the page to a "shared" lock
 728  722   * while holding the mutex protecting this page's p_selock field.
 729  723   */
 730  724  void
 731  725  page_downgrade(page_t *pp)
 732  726  {
 733  727          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 734  728          int excl_waiting;
 735  729  
 736  730          ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
 737  731          ASSERT(PAGE_EXCL(pp));
 738  732  
 739  733          mutex_enter(pse);
 740  734          excl_waiting =  pp->p_selock & SE_EWANTED;
 741      -        THREAD_KPRI_RELEASE();
 742  735          pp->p_selock = SE_READER | excl_waiting;
 743  736          if (CV_HAS_WAITERS(&pp->p_cv))
 744  737                  cv_broadcast(&pp->p_cv);
 745  738          mutex_exit(pse);
 746  739  }
 747  740  
 748  741  void
 749  742  page_lock_delete(page_t *pp)
 750  743  {
 751  744          kmutex_t *pse = PAGE_SE_MUTEX(pp);
 752  745  
 753  746          ASSERT(PAGE_EXCL(pp));
 754  747          ASSERT(pp->p_vnode == NULL);
 755  748          ASSERT(pp->p_offset == (u_offset_t)-1);
 756  749          ASSERT(!PP_ISFREE(pp));
 757  750  
 758  751          mutex_enter(pse);
 759      -        THREAD_KPRI_RELEASE();
 760  752          pp->p_selock = SE_DELETED;
 761  753          if (CV_HAS_WAITERS(&pp->p_cv))
 762  754                  cv_broadcast(&pp->p_cv);
 763  755          mutex_exit(pse);
 764  756  }
 765  757  
 766  758  int
 767  759  page_deleted(page_t *pp)
 768  760  {
 769  761          return (pp->p_selock == SE_DELETED);
 770  762  }
 771  763  
 772  764  /*
 773  765   * Implement the io lock for pages
 774  766   */
 775  767  void
 776  768  page_iolock_init(page_t *pp)
 777  769  {
 778  770          pp->p_iolock_state = 0;
 779  771          cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
 780  772  }
 781  773  
 782  774  /*
 783  775   * Acquire the i/o lock on a page.
 784  776   */
 785  777  void
 786  778  page_io_lock(page_t *pp)
 787  779  {
 788  780          kmutex_t *pio;
 789  781  
 790  782          pio = PAGE_IO_MUTEX(pp);
 791  783          mutex_enter(pio);
 792  784          while (pp->p_iolock_state & PAGE_IO_INUSE) {
 793  785                  cv_wait(&(pp->p_io_cv), pio);
 794  786          }
 795  787          pp->p_iolock_state |= PAGE_IO_INUSE;
 796  788          mutex_exit(pio);
 797  789  }
 798  790  
 799  791  /*
 800  792   * Release the i/o lock on a page.
 801  793   */
 802  794  void
 803  795  page_io_unlock(page_t *pp)
 804  796  {
 805  797          kmutex_t *pio;
 806  798  
 807  799          pio = PAGE_IO_MUTEX(pp);
 808  800          mutex_enter(pio);
 809  801          cv_broadcast(&pp->p_io_cv);
 810  802          pp->p_iolock_state &= ~PAGE_IO_INUSE;
 811  803          mutex_exit(pio);
 812  804  }
 813  805  
 814  806  /*
 815  807   * Try to acquire the i/o lock on a page without blocking.
 816  808   * Returns 1 on success, 0 on failure.
 817  809   */
 818  810  int
 819  811  page_io_trylock(page_t *pp)
 820  812  {
 821  813          kmutex_t *pio;
 822  814  
 823  815          if (pp->p_iolock_state & PAGE_IO_INUSE)
 824  816                  return (0);
 825  817  
 826  818          pio = PAGE_IO_MUTEX(pp);
 827  819          mutex_enter(pio);
 828  820  
 829  821          if (pp->p_iolock_state & PAGE_IO_INUSE) {
 830  822                  mutex_exit(pio);
 831  823                  return (0);
 832  824          }
 833  825          pp->p_iolock_state |= PAGE_IO_INUSE;
 834  826          mutex_exit(pio);
 835  827  
 836  828          return (1);
 837  829  }
 838  830  
 839  831  /*
 840  832   * Wait until the i/o lock is not held.
 841  833   */
 842  834  void
 843  835  page_io_wait(page_t *pp)
 844  836  {
 845  837          kmutex_t *pio;
 846  838  
 847  839          pio = PAGE_IO_MUTEX(pp);
 848  840          mutex_enter(pio);
 849  841          while (pp->p_iolock_state & PAGE_IO_INUSE) {
 850  842                  cv_wait(&(pp->p_io_cv), pio);
 851  843          }
 852  844          mutex_exit(pio);
 853  845  }
 854  846  
 855  847  /*
 856  848   * Returns 1 on success, 0 on failure.
 857  849   */
 858  850  int
 859  851  page_io_locked(page_t *pp)
 860  852  {
 861  853          return (pp->p_iolock_state & PAGE_IO_INUSE);
 862  854  }
 863  855  
 864  856  /*
 865  857   * Assert that the i/o lock on a page is held.
 866  858   * Returns 1 on success, 0 on failure.
 867  859   */
 868  860  int
 869  861  page_iolock_assert(page_t *pp)
 870  862  {
 871  863          return (page_io_locked(pp));
 872  864  }
 873  865  
 874  866  /*
 875  867   * Wrapper exported to kernel routines that are built
 876  868   * platform-independent (the macro is platform-dependent;
 877  869   * the size of vph_mutex[] is based on NCPU).
 878  870   *
 879  871   * Note that you can do stress testing on this by setting the
 880  872   * variable page_vnode_mutex_stress to something other than
 881  873   * zero in a DEBUG kernel in a debugger after loading the kernel.
 882  874   * Setting it after the kernel is running may not work correctly.
 883  875   */
 884  876  #ifdef DEBUG
 885  877  static int page_vnode_mutex_stress = 0;
 886  878  #endif
 887  879  
 888  880  kmutex_t *
 889  881  page_vnode_mutex(vnode_t *vp)
 890  882  {
 891  883          if (vp == &kvp)
 892  884                  return (&vph_mutex[VPH_TABLE_SIZE + 0]);
 893  885  
 894  886          if (vp == &zvp)
 895  887                  return (&vph_mutex[VPH_TABLE_SIZE + 1]);
 896  888  #ifdef DEBUG
 897  889          if (page_vnode_mutex_stress != 0)
 898  890                  return (&vph_mutex[0]);
 899  891  #endif
 900  892  
 901  893          return (&vph_mutex[VP_HASH_FUNC(vp)]);
 902  894  }
 903  895  
 904  896  kmutex_t *
 905  897  page_se_mutex(page_t *pp)
 906  898  {
 907  899          return (PAGE_SE_MUTEX(pp));
 908  900  }
 909  901  
 910  902  #ifdef VM_STATS
 911  903  uint_t pszclck_stat[4];
 912  904  #endif
 913  905  /*
 914  906   * Find, take and return a mutex held by hat_page_demote().
 915  907   * Called by page_demote_vp_pages() before hat_page_demote() call and by
 916  908   * routines that want to block hat_page_demote() but can't do it
 917  909   * via locking all constituent pages.
 918  910   *
 919  911   * Return NULL if p_szc is 0.
 920  912   *
 921  913   * It should only be used for pages that can be demoted by hat_page_demote()
 922  914   * i.e. non swapfs file system pages.  The logic here is lifted from
 923  915   * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 924  916   * since the page is locked and not free.
 925  917   *
 926  918   * Hash of the root page is used to find the lock.
 927  919   * To find the root in the presence of hat_page_demote() changing the location
 928  920   * of the root this routine relies on the fact that hat_page_demote() changes
 929  921   * root last.
 930  922   *
 931  923   * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
 932  924   * returned pp's p_szc may be any value.
 933  925   */
 934  926  kmutex_t *
 935  927  page_szc_lock(page_t *pp)
 936  928  {
 937  929          kmutex_t        *mtx;
 938  930          page_t          *rootpp;
 939  931          uint_t          szc;
 940  932          uint_t          rszc;
 941  933          uint_t          pszc = pp->p_szc;
 942  934  
 943  935          ASSERT(pp != NULL);
 944  936          ASSERT(PAGE_LOCKED(pp));
 945  937          ASSERT(!PP_ISFREE(pp));
 946  938          ASSERT(pp->p_vnode != NULL);
 947  939          ASSERT(!IS_SWAPFSVP(pp->p_vnode));
 948  940          ASSERT(!PP_ISKAS(pp));
 949  941  
 950  942  again:
 951  943          if (pszc == 0) {
 952  944                  VM_STAT_ADD(pszclck_stat[0]);
 953  945                  return (NULL);
 954  946          }
 955  947  
 956  948          /* The lock lives in the root page */
 957  949  
 958  950          rootpp = PP_GROUPLEADER(pp, pszc);
 959  951          mtx = PAGE_SZC_MUTEX(rootpp);
 960  952          mutex_enter(mtx);
 961  953  
 962  954          /*
 963  955           * since p_szc can only decrease if pp == rootpp
 964  956           * rootpp will be always the same i.e we have the right root
 965  957           * regardless of rootpp->p_szc.
 966  958           * If location of pp's root didn't change after we took
 967  959           * the lock we have the right root. return mutex hashed off it.
 968  960           */
 969  961          if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
 970  962                  VM_STAT_ADD(pszclck_stat[1]);
 971  963                  return (mtx);
 972  964          }
 973  965  
 974  966          /*
 975  967           * root location changed because page got demoted.
 976  968           * locate the new root.
 977  969           */
 978  970          if (rszc < pszc) {
 979  971                  szc = pp->p_szc;
 980  972                  ASSERT(szc < pszc);
 981  973                  mutex_exit(mtx);
 982  974                  pszc = szc;
 983  975                  VM_STAT_ADD(pszclck_stat[2]);
 984  976                  goto again;
 985  977          }
 986  978  
 987  979          VM_STAT_ADD(pszclck_stat[3]);
 988  980          /*
 989  981           * current hat_page_demote not done yet.
 990  982           * wait for it to finish.
 991  983           */
 992  984          mutex_exit(mtx);
 993  985          rootpp = PP_GROUPLEADER(rootpp, rszc);
 994  986          mtx = PAGE_SZC_MUTEX(rootpp);
 995  987          mutex_enter(mtx);
 996  988          mutex_exit(mtx);
 997  989          ASSERT(rootpp->p_szc < rszc);
 998  990          goto again;
 999  991  }
1000  992  
1001  993  int
1002  994  page_szc_lock_assert(page_t *pp)
1003  995  {
1004  996          page_t *rootpp = PP_PAGEROOT(pp);
1005  997          kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1006  998  
1007  999          return (MUTEX_HELD(mtx));
1008 1000  }
1009 1001  
1010 1002  /*
1011 1003   * memseg locking
1012 1004   */
1013 1005  static krwlock_t memsegslock;
1014 1006  
1015 1007  /*
1016 1008   * memlist (phys_install, phys_avail) locking.
1017 1009   */
1018 1010  static krwlock_t memlists_lock;
1019 1011  
1020 1012  int
1021 1013  memsegs_trylock(int writer)
1022 1014  {
1023 1015          return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
1024 1016  }
1025 1017  
1026 1018  void
1027 1019  memsegs_lock(int writer)
1028 1020  {
1029 1021          rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1030 1022  }
1031 1023  
1032 1024  /*ARGSUSED*/
1033 1025  void
1034 1026  memsegs_unlock(int writer)
1035 1027  {
1036 1028          rw_exit(&memsegslock);
1037 1029  }
1038 1030  
1039 1031  int
1040 1032  memsegs_lock_held(void)
1041 1033  {
1042 1034          return (RW_LOCK_HELD(&memsegslock));
1043 1035  }
1044 1036  
1045 1037  void
1046 1038  memlist_read_lock(void)
1047 1039  {
1048 1040          rw_enter(&memlists_lock, RW_READER);
1049 1041  }
1050 1042  
1051 1043  void
1052 1044  memlist_read_unlock(void)
1053 1045  {
1054 1046          rw_exit(&memlists_lock);
1055 1047  }
1056 1048  
1057 1049  void
1058 1050  memlist_write_lock(void)
1059 1051  {
1060 1052          rw_enter(&memlists_lock, RW_WRITER);
1061 1053  }
1062 1054  
1063 1055  void
1064 1056  memlist_write_unlock(void)
1065 1057  {
1066 1058          rw_exit(&memlists_lock);
1067 1059  }