11528 Makefile.noget can get gone
11529 Use -Wno-maybe-uninitialized
--- old/usr/src/uts/common/vm/vm_pagelist.c
+++ new/usr/src/uts/common/vm/vm_pagelist.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 26 * Copyright 2012 Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 30 /* All Rights Reserved */
31 31
32 32 /*
33 33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 34 * under license from the Regents of the University of California.
35 35 */
36 36
37 37
38 38 /*
39 39 * This file contains common functions to access and manage the page lists.
40 40 * Many of these routines originated from platform dependent modules
41 41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
42 42 * a platform independent manner.
43 43 *
44 44 * vm/vm_dep.h provides for platform specific support.
45 45 */
46 46
47 47 #include <sys/types.h>
48 48 #include <sys/debug.h>
49 49 #include <sys/cmn_err.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/atomic.h>
52 52 #include <sys/sysmacros.h>
53 53 #include <vm/as.h>
54 54 #include <vm/page.h>
55 55 #include <vm/seg_kmem.h>
56 56 #include <vm/seg_vn.h>
57 57 #include <sys/vmsystm.h>
58 58 #include <sys/memnode.h>
59 59 #include <vm/vm_dep.h>
60 60 #include <sys/lgrp.h>
61 61 #include <sys/mem_config.h>
62 62 #include <sys/callb.h>
63 63 #include <sys/mem_cage.h>
64 64 #include <sys/sdt.h>
65 65 #include <sys/dumphdr.h>
66 66 #include <sys/swap.h>
67 67
68 68 extern uint_t vac_colors;
69 69
70 70 #define MAX_PRAGMA_ALIGN 128
71 71
72 72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
73 73
74 74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
76 76 #else
77 77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
78 78 #endif
79 79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
80 80
81 81 /*
82 82  * number of page colors equivalent to requested color in page_get routines.
83 83 * If set, keeps large pages intact longer and keeps MPO allocation
84 84 * from the local mnode in favor of acquiring the 'correct' page color from
85 85 * a demoted large page or from a remote mnode.
86 86 */
87 87 uint_t colorequiv;
88 88
89 89 /*
90 90 * color equivalency mask for each page size.
91 91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 92 * High 4 bits determine the number of high order bits of the color to ignore.
93 93 * Low 4 bits determines number of low order bits of color to ignore (it's only
94 94 * relevant for hashed index based page coloring).
95 95 */
96 96 uchar_t colorequivszc[MMU_PAGE_SIZES];
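/*
 * Illustrative decode of the encoding described above; the helper below
 * is hypothetical and compiled out.  It only shows how the two nibbles of
 * a colorequivszc[] entry would be separated before being folded into a
 * color equivalency mask.
 */
#if 0
static void
colorequivszc_decode(uchar_t szc, uint_t *high_ignore, uint_t *low_ignore)
{
	uchar_t ceq = colorequivszc[szc];

	*high_ignore = ceq >> 4;	/* high order color bits to ignore */
	*low_ignore = ceq & 0xf;	/* low order bits, hashed coloring only */
}
#endif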
97 97
98 98 /*
99 99 * if set, specifies the percentage of large pages that are free from within
100 100 * a large page region before attempting to lock those pages for
101 101 * page_get_contig_pages processing.
102 102 *
103 103  * Should be turned on when kpr is available, since page_trylock_contig_pages
104 104  * can then be more selective.
105 105 */
106 106
107 107 int ptcpthreshold;
108 108
109 109 /*
110 110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 111 * Enabled by default via pgcplimitsearch.
112 112 *
113 113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 115 * bound. This upper bound range guarantees:
116 116 * - all large page 'slots' will be searched over time
117 117 * - the minimum (1) large page candidates considered on each pgcp call
118 118 * - count doesn't wrap around to 0
119 119 */
120 120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
121 121 int pgcplimitsearch = 1;
122 122
123 123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 124 #define SETPGCPFAILCNT(szc) \
125 125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
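/*
 * Worked example of the bound above (made up configuration): with
 * physinstalled == 0x300000 base pages (24GB of 8K pages),
 * highbit(0x300000) == 22 and PGCPFAILMAX == 1 << 21 == 0x200000,
 * which is at least half of physinstalled.  Once pgcpfailcnt[szc]
 * climbs to that value, SETPGCPFAILCNT resets it to 0x100000, so the
 * count stays nonzero and never wraps to 0.
 */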
127 127
128 128 #ifdef VM_STATS
129 129 struct vmm_vmstats_str vmm_vmstats;
130 130
131 131 #endif /* VM_STATS */
132 132
133 133 #if defined(__sparc)
134 134 #define LPGCREATE 0
135 135 #else
136 136 /* enable page_get_contig_pages */
137 137 #define LPGCREATE 1
138 138 #endif
139 139
140 140 int pg_contig_disable;
141 141 int pg_lpgcreate_nocage = LPGCREATE;
142 142
143 143 /*
144 144 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
145 145 */
146 146 #define PFNNULL 0
147 147
148 148 /* Flags involved in promotion and demotion routines */
149 149 #define PC_FREE 0x1 /* put page on freelist */
150 150 #define PC_ALLOC 0x2 /* return page for allocation */
151 151
152 152 /*
153 153 * Flag for page_demote to be used with PC_FREE to denote that we don't care
154 154 * what the color is as the color parameter to the function is ignored.
155 155 */
156 156 #define PC_NO_COLOR (-1)
157 157
158 158 /* mtype value for page_promote to use when mtype does not matter */
159 159 #define PC_MTYPE_ANY (-1)
160 160
161 161 /*
162 162 * page counters candidates info
163 163 * See page_ctrs_cands comment below for more details.
164 164 * fields are as follows:
165 165 * pcc_pages_free: # pages which freelist coalesce can create
166 166 * pcc_color_free: pointer to page free counts per color
167 167 */
168 168 typedef struct pcc_info {
169 169 pgcnt_t pcc_pages_free;
170 170 pgcnt_t *pcc_color_free;
171 171 uint_t pad[12];
172 172 } pcc_info_t;
173 173
174 174 /*
175 175 * On big machines it can take a long time to check page_counters
176 176 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
177 177 * updated sum of all elements of the corresponding page_counters arrays.
178 178 * page_freelist_coalesce() searches page_counters only if an appropriate
179 179 * element of page_ctrs_cands array is greater than 0.
180 180 *
181 181 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
182 182 */
183 183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
184 184
185 185 /*
186 186 * Return in val the total number of free pages which can be created
187 187 * for the given mnode (m), mrange (g), and region size (r)
188 188 */
189 189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
190 190 int i; \
191 191 val = 0; \
192 192 for (i = 0; i < NPC_MUTEX; i++) { \
193 193 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
194 194 } \
195 195 }
196 196
197 197 /*
198 198 * Return in val the total number of free pages which can be created
199 199 * for the given mnode (m), mrange (g), region size (r), and color (c)
200 200 */
201 201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
202 202 int i; \
203 203 val = 0; \
204 204 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
205 205 for (i = 0; i < NPC_MUTEX; i++) { \
206 206 val += \
207 207 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
208 208 } \
209 209 }
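/*
 * Hypothetical usage sketch (compiled out): a caller such as
 * page_freelist_coalesce() only needs to know whether any candidates
 * exist before paying for a page_counters walk.
 */
#if 0
static int
pgctrs_have_cands(int mnode, int mrange, int r)
{
	pgcnt_t cands;

	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
	return (cands != 0);
}
#endif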
210 210
211 211 /*
212 212 * We can only allow a single thread to update a counter within the physical
213 213 * range of the largest supported page size. That is the finest granularity
214 214 * possible since the counter values are dependent on each other
215 215  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
216 216 * ctr_mutex lock index for a particular physical range.
217 217 */
218 218 static kmutex_t *ctr_mutex[NPC_MUTEX];
219 219
220 220 #define PP_CTR_LOCK_INDX(pp) \
221 221 (((pp)->p_pagenum >> \
222 222 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
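/*
 * For example (sizes are illustrative, the real values are platform
 * dependent): with 8K base pages, a 4M largest page size
 * (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9) and NPC_MUTEX == 4,
 * pfns 0..511 hash to lock 0, 512..1023 to lock 1, 1024..1535 to
 * lock 2, 1536..2047 to lock 3, and 2048..2559 wrap back to lock 0.
 */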
223 223
224 224 #define INVALID_COLOR 0xffffffff
225 225 #define INVALID_MASK 0xffffffff
226 226
227 227 /*
228 228 * Local functions prototypes.
229 229 */
230 230
231 231 void page_ctr_add(int, int, page_t *, int);
232 232 void page_ctr_add_internal(int, int, page_t *, int);
233 233 void page_ctr_sub(int, int, page_t *, int);
234 234 void page_ctr_sub_internal(int, int, page_t *, int);
235 235 void page_freelist_lock(int);
236 236 void page_freelist_unlock(int);
237 237 page_t *page_promote(int, pfn_t, uchar_t, int, int);
238 238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
239 239 page_t *page_freelist_split(uchar_t,
240 240 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
241 241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
242 242 static int page_trylock_cons(page_t *pp, se_t se);
243 243
244 244 /*
245 245 * The page_counters array below is used to keep track of free contiguous
246 246 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
247 247 * This contains an array of counters, the size of the array, a shift value
248 248 * used to convert a pagenum into a counter array index or vice versa, as
249 249 * well as a cache of the last successful index to be promoted to a larger
250 250 * page size. As an optimization, we keep track of the last successful index
251 251 * to be promoted per page color for the given size region, and this is
252 252 * allocated dynamically based upon the number of colors for a given
253 253 * region size.
254 254 *
255 255 * Conceptually, the page counters are represented as:
256 256 *
257 257 * page_counters[region_size][mnode]
258 258 *
259 259 * region_size: size code of a candidate larger page made up
260 260 * of contiguous free smaller pages.
261 261 *
262 262 * page_counters[region_size][mnode].hpm_counters[index]:
263 263 * represents how many (region_size - 1) pages either
264 264 * exist or can be created within the given index range.
265 265 *
266 266 * Let's look at a sparc example:
267 267 * If we want to create a free 512k page, we look at region_size 2
268 268 * for the mnode we want. We calculate the index and look at a specific
269 269 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
270 270 * this location, it means that 8 64k pages either exist or can be created
271 271 * from 8K pages in order to make a single free 512k page at the given
272 272 * index. Note that when a region is full, it will contribute to the
273 273 * counts in the region above it. Thus we will not know what page
274 274 * size the free pages will be which can be promoted to this new free
275 275 * page unless we look at all regions below the current region.
276 276 */
277 277
278 278 /*
279 279 * Note: hpmctr_t is defined in platform vm_dep.h
280 280 * hw_page_map_t contains all the information needed for the page_counters
281 281 * logic. The fields are as follows:
282 282 *
283 283 * hpm_counters: dynamically allocated array to hold counter data
284 284 * hpm_entries: entries in hpm_counters
285 285 * hpm_shift: shift for pnum/array index conv
286 286 * hpm_base: PFN mapped to counter index 0
287 287 * hpm_color_current: last index in counter array for this color at
288 288 * which we successfully created a large page
289 289 */
290 290 typedef struct hw_page_map {
291 291 hpmctr_t *hpm_counters;
292 292 size_t hpm_entries;
293 293 int hpm_shift;
294 294 pfn_t hpm_base;
295 295 size_t *hpm_color_current[MAX_MNODE_MRANGES];
296 296 #if defined(__sparc)
297 297 uint_t pad[4];
298 298 #endif
299 299 } hw_page_map_t;
300 300
301 301 /*
302 302 * Element zero is not used, but is allocated for convenience.
303 303 */
304 304 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
305 305
306 306 /*
307 307 * Cached value of MNODE_RANGE_CNT(mnode).
308 308  * This is a function call on x86.
309 309 */
310 310 static int mnode_nranges[MAX_MEM_NODES];
311 311 static int mnode_maxmrange[MAX_MEM_NODES];
312 312
313 313 /*
314 314 * The following macros are convenient ways to get access to the individual
315 315 * elements of the page_counters arrays. They can be used on both
316 316 * the left side and right side of equations.
317 317 */
318 318 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
319 319 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
320 320
321 321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
322 322 (page_counters[(rg_szc)][(mnode)].hpm_counters)
323 323
324 324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
325 325 (page_counters[(rg_szc)][(mnode)].hpm_shift)
326 326
327 327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
328 328 (page_counters[(rg_szc)][(mnode)].hpm_entries)
329 329
330 330 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
331 331 (page_counters[(rg_szc)][(mnode)].hpm_base)
332 332
333 333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
334 334 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
335 335
336 336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
337 337 (page_counters[(rg_szc)][(mnode)]. \
338 338 hpm_color_current[(mrange)][(color)])
339 339
340 340 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
341 341 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
342 342 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
343 343
344 344 #define IDX_TO_PNUM(mnode, rg_szc, index) \
345 345 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
346 346 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
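/*
 * Quick sanity example (made up values): with
 * PAGE_COUNTERS_BASE(mnode, r) == 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, r) == 9 (512 page regions), pfn 0x80a37
 * maps to index (0x80a37 - 0x80000) >> 9 == 5, and IDX_TO_PNUM maps
 * index 5 back to pfn 0x80a00, the first pfn of that region.  The two
 * macros are inverses only at region granularity, which is what the
 * ASSERTs in page_ctrs_alloc() verify.
 */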
347 347
348 348 /*
349 349 * Protects the hpm_counters and hpm_color_current memory from changing while
350 350 * looking at page counters information.
351 351 * Grab the write lock to modify what these fields point at.
352 352 * Grab the read lock to prevent any pointers from changing.
353 353 * The write lock can not be held during memory allocation due to a possible
354 354 * recursion deadlock with trying to grab the read lock while the
355 355 * write lock is already held.
356 356 */
357 357 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
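/*
 * Hypothetical reader sketch (compiled out) of the locking protocol
 * described above: a walk of the counters outside the setup path
 * would hold the read lock so page_ctrs_adjust() cannot swap the
 * arrays out from underneath it.
 */
#if 0
static hpmctr_t
pgctrs_peek(int mnode, int r, size_t idx)
{
	hpmctr_t val;

	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	val = PAGE_COUNTERS(mnode, r, idx);
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (val);
}
#endif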
358 358
359 359
360 360 /*
361 361 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
362 362 */
363 363 void
364 364 cpu_vm_data_init(struct cpu *cp)
365 365 {
366 366 if (cp == CPU0) {
367 367 cp->cpu_vm_data = (void *)&vm_cpu_data0;
368 368 } else {
369 369 void *kmptr;
370 370 int align;
371 371 size_t sz;
372 372
373 373 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
374 374 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
375 375 kmptr = kmem_zalloc(sz, KM_SLEEP);
376 376 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
377 377 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
378 378 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
379 379 }
380 380 }
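/*
 * Sketch of the alignment arithmetic above (sizes assumed for the
 * example): with sizeof (vm_cpu_data_t) == 0x48 and an L2 line size
 * of 0x40, sz == P2ROUNDUP(0x48, 0x40) + 0x40 == 0xc0.  Wherever
 * kmem_zalloc() happens to place the buffer, there is always room to
 * round the pointer up to the next 0x40 boundary and still fit the
 * structure; vc_kmptr and vc_kmsize remember the raw allocation so
 * cpu_vm_data_destroy() can free it.
 */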
381 381
382 382 /*
383 383 * free cpu_vm_data
384 384 */
385 385 void
386 386 cpu_vm_data_destroy(struct cpu *cp)
387 387 {
388 388 if (cp->cpu_seqid && cp->cpu_vm_data) {
389 389 ASSERT(cp != CPU0);
390 390 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
391 391 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
392 392 }
393 393 cp->cpu_vm_data = NULL;
394 394 }
395 395
396 396
397 397 /*
398 398 * page size to page size code
399 399 */
400 400 int
401 401 page_szc(size_t pagesize)
402 402 {
403 403 int i = 0;
404 404
405 405 while (hw_page_array[i].hp_size) {
406 406 if (pagesize == hw_page_array[i].hp_size)
407 407 return (i);
408 408 i++;
409 409 }
410 410 return (-1);
411 411 }
412 412
413 413 /*
414 414 * page size to page size code with the restriction that it be a supported
415 415 * user page size. If it's not a supported user page size, -1 will be returned.
416 416 */
417 417 int
418 418 page_szc_user_filtered(size_t pagesize)
419 419 {
420 420 int szc = page_szc(pagesize);
421 421 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
422 422 return (szc);
423 423 }
424 424 return (-1);
425 425 }
426 426
427 427 /*
428 428 * Return how many page sizes are available for the user to use. This is
429 429 * what the hardware supports and not based upon how the OS implements the
430 430 * support of different page sizes.
431 431 *
432 432 * If legacy is non-zero, return the number of pagesizes available to legacy
433 433 * applications. The number of legacy page sizes might be less than the
434 434 * exported user page sizes. This is to prevent legacy applications that
435 435  * use the largest page size returned from getpagesizes(3c) from inadvertently
436 436 * using the 'new' large pagesizes.
437 437 */
438 438 uint_t
439 439 page_num_user_pagesizes(int legacy)
440 440 {
441 441 if (legacy)
442 442 return (mmu_legacy_page_sizes);
443 443 return (mmu_exported_page_sizes);
444 444 }
445 445
446 446 uint_t
447 447 page_num_pagesizes(void)
448 448 {
449 449 return (mmu_page_sizes);
450 450 }
451 451
452 452 /*
453 453 * returns the count of the number of base pagesize pages associated with szc
454 454 */
455 455 pgcnt_t
456 456 page_get_pagecnt(uint_t szc)
457 457 {
458 458 if (szc >= mmu_page_sizes)
459 459 panic("page_get_pagecnt: out of range %d", szc);
460 460 return (hw_page_array[szc].hp_pgcnt);
461 461 }
462 462
463 463 size_t
464 464 page_get_pagesize(uint_t szc)
465 465 {
466 466 if (szc >= mmu_page_sizes)
467 467 panic("page_get_pagesize: out of range %d", szc);
468 468 return (hw_page_array[szc].hp_size);
469 469 }
470 470
471 471 /*
472 472 * Return the size of a page based upon the index passed in. An index of
473 473 * zero refers to the smallest page size in the system, and as index increases
474 474 * it refers to the next larger supported page size in the system.
475 475 * Note that szc and userszc may not be the same due to unsupported szc's on
476 476 * some systems.
477 477 */
478 478 size_t
479 479 page_get_user_pagesize(uint_t userszc)
480 480 {
481 481 uint_t szc = USERSZC_2_SZC(userszc);
482 482
483 483 if (szc >= mmu_page_sizes)
484 484 panic("page_get_user_pagesize: out of range %d", szc);
485 485 return (hw_page_array[szc].hp_size);
486 486 }
487 487
488 488 uint_t
489 489 page_get_shift(uint_t szc)
490 490 {
491 491 if (szc >= mmu_page_sizes)
492 492 panic("page_get_shift: out of range %d", szc);
493 493 return (PAGE_GET_SHIFT(szc));
494 494 }
495 495
496 496 uint_t
497 497 page_get_pagecolors(uint_t szc)
498 498 {
499 499 if (szc >= mmu_page_sizes)
500 500 panic("page_get_pagecolors: out of range %d", szc);
501 501 return (PAGE_GET_PAGECOLORS(szc));
502 502 }
503 503
504 504 /*
505 505 * this assigns the desired equivalent color after a split
506 506 */
507 507 uint_t
508 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
509 509 uint_t ncolor, uint_t ceq_mask)
510 510 {
511 511 ASSERT(nszc > szc);
512 512 ASSERT(szc < mmu_page_sizes);
513 513 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
514 514 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
515 515
516 516 color &= ceq_mask;
517 517 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
518 518 return (color | (ncolor & ~ceq_mask));
519 519 }
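/*
 * For instance (made up values): with ceq_mask == 0x3, a requested
 * color of 0x5 and a converted ncolor of 0xa, the result is
 * (0x5 & 0x3) | (0xa & ~0x3) == 0x1 | 0x8 == 0x9.  The bits covered
 * by the equivalency mask come from the requested color; everything
 * else is inherited from the larger page being split.
 */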
520 520
521 521 /*
522 522 * The interleaved_mnodes flag is set when mnodes overlap in
523 523 * the physbase..physmax range, but have disjoint slices.
524 524 * In this case hpm_counters is shared by all mnodes.
525 525 * This flag is set dynamically by the platform.
526 526 */
527 527 int interleaved_mnodes = 0;
528 528
529 529 /*
530 530 * Called by startup().
531 531 * Size up the per page size free list counters based on physmax
532 532 * of each node and max_mem_nodes.
533 533 *
534 534 * If interleaved_mnodes is set we need to find the first mnode that
535 535 * exists. hpm_counters for the first mnode will then be shared by
536 536 * all other mnodes. If interleaved_mnodes is not set, just set
537 537 * first=mnode each time. That means there will be no sharing.
538 538 */
539 539 size_t
540 540 page_ctrs_sz(void)
541 541 {
542 542 int r; /* region size */
543 543 int mnode;
544 544 int firstmn; /* first mnode that exists */
545 545 int nranges;
546 546 pfn_t physbase;
547 547 pfn_t physmax;
548 548 uint_t ctrs_sz = 0;
549 549 int i;
550 550 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
551 551
552 552 /*
553 553 * We need to determine how many page colors there are for each
554 554 * page size in order to allocate memory for any color specific
555 555 * arrays.
556 556 */
557 557 for (i = 0; i < mmu_page_sizes; i++) {
558 558 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
559 559 }
560 560
561 561 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
562 562
563 563 pgcnt_t r_pgcnt;
564 564 pfn_t r_base;
565 565 pgcnt_t r_align;
566 566
567 567 if (mem_node_config[mnode].exists == 0)
568 568 continue;
569 569
570 570 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
571 571 nranges = MNODE_RANGE_CNT(mnode);
572 572 mnode_nranges[mnode] = nranges;
573 573 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
574 574
575 575 /*
576 576 * determine size needed for page counter arrays with
577 577 * base aligned to large page size.
578 578 */
579 579 for (r = 1; r < mmu_page_sizes; r++) {
580 580 /* add in space for hpm_color_current */
581 581 ctrs_sz += sizeof (size_t) *
582 582 colors_per_szc[r] * nranges;
583 583
584 584 if (firstmn != mnode)
585 585 continue;
586 586
587 587 /* add in space for hpm_counters */
588 588 r_align = page_get_pagecnt(r);
589 589 r_base = physbase;
590 590 r_base &= ~(r_align - 1);
591 591 r_pgcnt = howmany(physmax - r_base + 1, r_align);
592 592
593 593 /*
594 594 * Round up to always allocate on pointer sized
595 595 * boundaries.
596 596 */
597 597 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
598 598 sizeof (hpmctr_t *));
599 599 }
600 600 }
601 601
602 602 for (r = 1; r < mmu_page_sizes; r++) {
603 603 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
604 604 }
605 605
606 606 /* add in space for page_ctrs_cands and pcc_color_free */
607 607 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
608 608 mmu_page_sizes * NPC_MUTEX;
609 609
610 610 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
611 611
612 612 if (mem_node_config[mnode].exists == 0)
613 613 continue;
614 614
615 615 nranges = mnode_nranges[mnode];
616 616 ctrs_sz += sizeof (pcc_info_t) * nranges *
617 617 mmu_page_sizes * NPC_MUTEX;
618 618 for (r = 1; r < mmu_page_sizes; r++) {
619 619 ctrs_sz += sizeof (pgcnt_t) * nranges *
620 620 colors_per_szc[r] * NPC_MUTEX;
621 621 }
622 622 }
623 623
624 624 /* ctr_mutex */
625 625 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
626 626
627 627 /* size for page list counts */
628 628 PLCNT_SZ(ctrs_sz);
629 629
630 630 /*
631 631 * add some slop for roundups. page_ctrs_alloc will roundup the start
632 632 * address of the counters to ecache_alignsize boundary for every
633 633 * memory node.
634 634 */
635 635 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
636 636 }
637 637
638 638 caddr_t
639 639 page_ctrs_alloc(caddr_t alloc_base)
640 640 {
641 641 int mnode;
642 642 int mrange, nranges;
643 643 int r; /* region size */
644 644 int i;
645 645 int firstmn; /* first mnode that exists */
646 646 pfn_t physbase;
647 647 pfn_t physmax;
648 648 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
649 649
650 650 /*
651 651 * We need to determine how many page colors there are for each
652 652 * page size in order to allocate memory for any color specific
653 653 * arrays.
654 654 */
655 655 for (i = 0; i < mmu_page_sizes; i++) {
656 656 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
657 657 }
658 658
659 659 for (r = 1; r < mmu_page_sizes; r++) {
660 660 page_counters[r] = (hw_page_map_t *)alloc_base;
661 661 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
662 662 }
663 663
664 664 /* page_ctrs_cands and pcc_color_free array */
665 665 for (i = 0; i < NPC_MUTEX; i++) {
666 666 for (r = 1; r < mmu_page_sizes; r++) {
667 667
668 668 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
669 669 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
670 670
671 671 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
672 672 pcc_info_t *pi;
673 673
674 674 if (mem_node_config[mnode].exists == 0)
675 675 continue;
676 676
677 677 nranges = mnode_nranges[mnode];
678 678
679 679 pi = (pcc_info_t *)alloc_base;
680 680 alloc_base += sizeof (pcc_info_t) * nranges;
681 681 page_ctrs_cands[i][r][mnode] = pi;
682 682
683 683 for (mrange = 0; mrange < nranges; mrange++) {
684 684 pi->pcc_color_free =
685 685 (pgcnt_t *)alloc_base;
686 686 alloc_base += sizeof (pgcnt_t) *
687 687 colors_per_szc[r];
688 688 pi++;
689 689 }
690 690 }
691 691 }
692 692 }
693 693
694 694 /* ctr_mutex */
695 695 for (i = 0; i < NPC_MUTEX; i++) {
696 696 ctr_mutex[i] = (kmutex_t *)alloc_base;
697 697 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
698 698 }
699 699
700 700 /* initialize page list counts */
701 701 PLCNT_INIT(alloc_base);
702 702
703 703 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
704 704
705 705 pgcnt_t r_pgcnt;
706 706 pfn_t r_base;
707 707 pgcnt_t r_align;
708 708 int r_shift;
709 709 int nranges = mnode_nranges[mnode];
710 710
711 711 if (mem_node_config[mnode].exists == 0)
712 712 continue;
713 713
714 714 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
715 715
716 716 for (r = 1; r < mmu_page_sizes; r++) {
717 717 /*
718 718 * the page_counters base has to be aligned to the
719 719 * page count of page size code r otherwise the counts
720 720 * will cross large page boundaries.
721 721 */
722 722 r_align = page_get_pagecnt(r);
723 723 r_base = physbase;
724 724 /* base needs to be aligned - lower to aligned value */
725 725 r_base &= ~(r_align - 1);
726 726 r_pgcnt = howmany(physmax - r_base + 1, r_align);
727 727 r_shift = PAGE_BSZS_SHIFT(r);
728 728
729 729 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
730 730 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
731 731 PAGE_COUNTERS_BASE(mnode, r) = r_base;
732 732 for (mrange = 0; mrange < nranges; mrange++) {
733 733 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
734 734 r, mrange) = (size_t *)alloc_base;
735 735 alloc_base += sizeof (size_t) *
736 736 colors_per_szc[r];
737 737 }
738 738 for (i = 0; i < colors_per_szc[r]; i++) {
739 739 uint_t color_mask = colors_per_szc[r] - 1;
740 740 pfn_t pfnum = r_base;
741 741 size_t idx;
742 742 int mrange;
743 743 MEM_NODE_ITERATOR_DECL(it);
744 744
745 745 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
746 746 if (pfnum == (pfn_t)-1) {
747 747 idx = 0;
748 748 } else {
749 749 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
750 750 color_mask, color_mask, &it);
751 751 idx = PNUM_TO_IDX(mnode, r, pfnum);
752 752 idx = (idx >= r_pgcnt) ? 0 : idx;
753 753 }
754 754 for (mrange = 0; mrange < nranges; mrange++) {
755 755 PAGE_COUNTERS_CURRENT_COLOR(mnode,
756 756 r, i, mrange) = idx;
757 757 }
758 758 }
759 759
760 760 /* hpm_counters may be shared by all mnodes */
761 761 if (firstmn == mnode) {
762 762 PAGE_COUNTERS_COUNTERS(mnode, r) =
763 763 (hpmctr_t *)alloc_base;
764 764 alloc_base +=
765 765 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
766 766 sizeof (hpmctr_t *));
767 767 } else {
768 768 PAGE_COUNTERS_COUNTERS(mnode, r) =
769 769 PAGE_COUNTERS_COUNTERS(firstmn, r);
770 770 }
771 771
772 772 /*
773 773 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
774 774 * satisfy the identity requirement.
775 775 * We should be able to go from one to the other
776 776 * and get consistent values.
777 777 */
778 778 ASSERT(PNUM_TO_IDX(mnode, r,
779 779 (IDX_TO_PNUM(mnode, r, 0))) == 0);
780 780 ASSERT(IDX_TO_PNUM(mnode, r,
781 781 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
782 782 }
783 783 /*
784 784 * Roundup the start address of the page_counters to
785 785 * cache aligned boundary for every memory node.
786 786 * page_ctrs_sz() has added some slop for these roundups.
787 787 */
788 788 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
789 789 L2CACHE_ALIGN);
790 790 }
791 791
792 792 /* Initialize other page counter specific data structures. */
793 793 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
794 794 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
795 795 }
796 796
797 797 return (alloc_base);
798 798 }
799 799
800 800 /*
801 801 * Functions to adjust region counters for each size free list.
802 802 * Caller is responsible to acquire the ctr_mutex lock if necessary and
803 803 * thus can be called during startup without locks.
804 804 */
805 805 /* ARGSUSED */
806 806 void
807 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
808 808 {
809 809 ssize_t r; /* region size */
810 810 ssize_t idx;
811 811 pfn_t pfnum;
812 812 int lckidx;
813 813
814 814 ASSERT(mnode == PP_2_MEM_NODE(pp));
815 815 ASSERT(mtype == PP_2_MTYPE(pp));
816 816
817 817 ASSERT(pp->p_szc < mmu_page_sizes);
818 818
819 819 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
820 820
821 821 /* no counter update needed for largest page size */
822 822 if (pp->p_szc >= mmu_page_sizes - 1) {
823 823 return;
824 824 }
825 825
826 826 r = pp->p_szc + 1;
827 827 pfnum = pp->p_pagenum;
828 828 lckidx = PP_CTR_LOCK_INDX(pp);
829 829
830 830 /*
831 831 * Increment the count of free pages for the current
832 832 * region. Continue looping up in region size incrementing
833 833  * count if the preceding region is full.
834 834 */
835 835 while (r < mmu_page_sizes) {
836 836 idx = PNUM_TO_IDX(mnode, r, pfnum);
837 837
838 838 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
839 839 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
840 840
841 841 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
842 842 break;
843 843 } else {
844 844 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
845 845 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
846 846 [MTYPE_2_MRANGE(mnode, root_mtype)];
847 847
848 848 cand->pcc_pages_free++;
849 849 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
850 850 }
851 851 r++;
852 852 }
853 853 }
854 854
855 855 void
856 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
857 857 {
858 858 int lckidx = PP_CTR_LOCK_INDX(pp);
859 859 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
860 860
861 861 mutex_enter(lock);
862 862 page_ctr_add_internal(mnode, mtype, pp, flags);
863 863 mutex_exit(lock);
864 864 }
865 865
866 866 void
867 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
868 868 {
869 869 int lckidx;
870 870 ssize_t r; /* region size */
871 871 ssize_t idx;
872 872 pfn_t pfnum;
873 873
874 874 ASSERT(mnode == PP_2_MEM_NODE(pp));
875 875 ASSERT(mtype == PP_2_MTYPE(pp));
876 876
877 877 ASSERT(pp->p_szc < mmu_page_sizes);
878 878
879 879 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
880 880
881 881 /* no counter update needed for largest page size */
882 882 if (pp->p_szc >= mmu_page_sizes - 1) {
883 883 return;
884 884 }
885 885
886 886 r = pp->p_szc + 1;
887 887 pfnum = pp->p_pagenum;
888 888 lckidx = PP_CTR_LOCK_INDX(pp);
889 889
890 890 /*
891 891 * Decrement the count of free pages for the current
892 892 * region. Continue looping up in region size decrementing
893 893  * count if the preceding region was full.
894 894 */
895 895 while (r < mmu_page_sizes) {
896 896 idx = PNUM_TO_IDX(mnode, r, pfnum);
897 897
898 898 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
899 899 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
900 900
901 901 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
902 902 break;
903 903 } else {
904 904 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
905 905 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
906 906 [MTYPE_2_MRANGE(mnode, root_mtype)];
907 907
908 908 ASSERT(cand->pcc_pages_free != 0);
909 909 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
910 910
911 911 cand->pcc_pages_free--;
912 912 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
913 913 }
914 914 r++;
915 915 }
916 916 }
917 917
918 918 void
919 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
920 920 {
921 921 int lckidx = PP_CTR_LOCK_INDX(pp);
922 922 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
923 923
924 924 mutex_enter(lock);
925 925 page_ctr_sub_internal(mnode, mtype, pp, flags);
926 926 mutex_exit(lock);
927 927 }
928 928
929 929 /*
930 930 * Adjust page counters following a memory attach, since typically the
931 931 * size of the array needs to change, and the PFN to counter index
932 932 * mapping needs to change.
933 933 *
934 934 * It is possible this mnode did not exist at startup. In that case
935 935 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
936 936 * to change (a theoretical possibility on x86), which means pcc_color_free
937 937 * arrays must be extended.
938 938 */
939 939 uint_t
940 940 page_ctrs_adjust(int mnode)
941 941 {
942 942 pgcnt_t npgs;
943 943 int r; /* region size */
944 944 int i;
945 945 size_t pcsz, old_csz;
946 946 hpmctr_t *new_ctr, *old_ctr;
947 947 pfn_t oldbase, newbase;
948 948 pfn_t physbase, physmax;
949 949 size_t old_npgs;
950 950 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
951 951 size_t size_cache[MMU_PAGE_SIZES];
952 952 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
953 953 size_t *old_color_array[MAX_MNODE_MRANGES];
954 954 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
955 955 pcc_info_t **cands_cache;
956 956 pcc_info_t *old_pi, *pi;
957 957 pgcnt_t *pgcntp;
958 958 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
959 959 int cands_cache_nranges;
960 960 int old_maxmrange, new_maxmrange;
961 961 int rc = 0;
962 962 int oldmnode;
963 963
964 964 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
965 965 MMU_PAGE_SIZES, KM_NOSLEEP);
966 966 if (cands_cache == NULL)
967 967 return (ENOMEM);
968 968
969 969 i = -1;
970 970 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
971 971
972 972 newbase = physbase & ~PC_BASE_ALIGN_MASK;
973 973 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
974 974
975 975 /* prepare to free non-null pointers on the way out */
976 976 cands_cache_nranges = nranges;
977 977 bzero(ctr_cache, sizeof (ctr_cache));
978 978 bzero(color_cache, sizeof (color_cache));
979 979
980 980 /*
981 981 * We need to determine how many page colors there are for each
982 982 * page size in order to allocate memory for any color specific
983 983 * arrays.
984 984 */
985 985 for (r = 0; r < mmu_page_sizes; r++) {
986 986 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
987 987 }
988 988
989 989 /*
990 990 * Preallocate all of the new hpm_counters arrays as we can't
991 991 * hold the page_ctrs_rwlock as a writer and allocate memory.
992 992 * If we can't allocate all of the arrays, undo our work so far
993 993 * and return failure.
994 994 */
995 995 for (r = 1; r < mmu_page_sizes; r++) {
996 996 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
997 997 size_cache[r] = pcsz;
998 998 ctr_cache[r] = kmem_zalloc(pcsz *
999 999 sizeof (hpmctr_t), KM_NOSLEEP);
1000 1000 if (ctr_cache[r] == NULL) {
1001 1001 rc = ENOMEM;
1002 1002 goto cleanup;
1003 1003 }
1004 1004 }
1005 1005
1006 1006 /*
1007 1007 * Preallocate all of the new color current arrays as we can't
1008 1008 * hold the page_ctrs_rwlock as a writer and allocate memory.
1009 1009 * If we can't allocate all of the arrays, undo our work so far
1010 1010 * and return failure.
1011 1011 */
1012 1012 for (r = 1; r < mmu_page_sizes; r++) {
1013 1013 for (mrange = 0; mrange < nranges; mrange++) {
1014 1014 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1015 1015 colors_per_szc[r], KM_NOSLEEP);
1016 1016 if (color_cache[r][mrange] == NULL) {
1017 1017 rc = ENOMEM;
1018 1018 goto cleanup;
1019 1019 }
1020 1020 }
1021 1021 }
1022 1022
1023 1023 /*
1024 1024 * Preallocate all of the new pcc_info_t arrays as we can't
1025 1025 * hold the page_ctrs_rwlock as a writer and allocate memory.
1026 1026 * If we can't allocate all of the arrays, undo our work so far
1027 1027 * and return failure.
1028 1028 */
1029 1029 for (r = 1; r < mmu_page_sizes; r++) {
1030 1030 for (i = 0; i < NPC_MUTEX; i++) {
1031 1031 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1032 1032 KM_NOSLEEP);
1033 1033 if (pi == NULL) {
1034 1034 rc = ENOMEM;
1035 1035 goto cleanup;
1036 1036 }
1037 1037 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1038 1038
1039 1039 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1040 1040 pgcntp = kmem_zalloc(colors_per_szc[r] *
1041 1041 sizeof (pgcnt_t), KM_NOSLEEP);
1042 1042 if (pgcntp == NULL) {
1043 1043 rc = ENOMEM;
1044 1044 goto cleanup;
1045 1045 }
1046 1046 pi->pcc_color_free = pgcntp;
1047 1047 }
1048 1048 }
1049 1049 }
1050 1050
1051 1051 /*
1052 1052 * Grab the write lock to prevent others from walking these arrays
1053 1053 * while we are modifying them.
1054 1054 */
1055 1055 PAGE_CTRS_WRITE_LOCK(mnode);
1056 1056
1057 1057 /*
1058 1058 * For interleaved mnodes, find the first mnode
1059 1059 * with valid page counters since the current
1060 1060 * mnode may have just been added and not have
1061 1061 * valid page counters.
1062 1062 */
1063 1063 if (interleaved_mnodes) {
1064 1064 for (i = 0; i < max_mem_nodes; i++)
1065 1065 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1066 1066 break;
1067 1067 ASSERT(i < max_mem_nodes);
1068 1068 oldmnode = i;
1069 1069 } else
1070 1070 oldmnode = mnode;
1071 1071
1072 1072 old_nranges = mnode_nranges[mnode];
1073 1073 cands_cache_nranges = old_nranges;
1074 1074 mnode_nranges[mnode] = nranges;
1075 1075 old_maxmrange = mnode_maxmrange[mnode];
1076 1076 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1077 1077 new_maxmrange = mnode_maxmrange[mnode];
1078 1078
1079 1079 for (r = 1; r < mmu_page_sizes; r++) {
1080 1080 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1081 1081 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1082 1082 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1083 1083 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1084 1084 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1085 1085 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1086 1086 old_color_array[mrange] =
1087 1087 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1088 1088 r, mrange);
1089 1089 }
1090 1090
1091 1091 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1092 1092 new_ctr = ctr_cache[r];
1093 1093 ctr_cache[r] = NULL;
1094 1094 if (old_ctr != NULL &&
1095 1095 (oldbase + old_npgs > newbase) &&
1096 1096 (newbase + npgs > oldbase)) {
1097 1097 /*
1098 1098 * Map the intersection of the old and new
1099 1099 * counters into the new array.
1100 1100 */
1101 1101 size_t offset;
1102 1102 if (newbase > oldbase) {
1103 1103 offset = (newbase - oldbase) >>
1104 1104 PAGE_COUNTERS_SHIFT(mnode, r);
1105 1105 bcopy(old_ctr + offset, new_ctr,
1106 1106 MIN(pcsz, (old_csz - offset)) *
1107 1107 sizeof (hpmctr_t));
1108 1108 } else {
1109 1109 offset = (oldbase - newbase) >>
1110 1110 PAGE_COUNTERS_SHIFT(mnode, r);
1111 1111 bcopy(old_ctr, new_ctr + offset,
1112 1112 MIN(pcsz - offset, old_csz) *
1113 1113 sizeof (hpmctr_t));
1114 1114 }
1115 1115 }
1116 1116
1117 1117 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1118 1118 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1119 1119 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1120 1120
1121 1121 /* update shared hpm_counters in other mnodes */
1122 1122 if (interleaved_mnodes) {
1123 1123 for (i = 0; i < max_mem_nodes; i++) {
1124 1124 if ((i == mnode) ||
1125 1125 (mem_node_config[i].exists == 0))
1126 1126 continue;
1127 1127 ASSERT(
1128 1128 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1129 1129 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1130 1130 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1131 1131 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1132 1132 PAGE_COUNTERS_BASE(i, r) = newbase;
1133 1133 }
1134 1134 }
1135 1135
1136 1136 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1137 1137 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1138 1138 color_cache[r][mrange];
1139 1139 color_cache[r][mrange] = NULL;
1140 1140 }
1141 1141 /*
1142 1142 * for now, just reset on these events as it's probably
1143 1143 * not worthwhile to try and optimize this.
1144 1144 */
1145 1145 for (i = 0; i < colors_per_szc[r]; i++) {
1146 1146 uint_t color_mask = colors_per_szc[r] - 1;
1147 1147 int mlo = interleaved_mnodes ? 0 : mnode;
1148 1148 int mhi = interleaved_mnodes ? max_mem_nodes :
1149 1149 (mnode + 1);
1150 1150 int m;
1151 1151 pfn_t pfnum;
1152 1152 size_t idx;
1153 1153 MEM_NODE_ITERATOR_DECL(it);
1154 1154
1155 1155 for (m = mlo; m < mhi; m++) {
1156 1156 if (mem_node_config[m].exists == 0)
1157 1157 continue;
1158 1158 pfnum = newbase;
1159 1159 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1160 1160 if (pfnum == (pfn_t)-1) {
1161 1161 idx = 0;
1162 1162 } else {
1163 1163 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1164 1164 color_mask, color_mask, &it);
1165 1165 idx = PNUM_TO_IDX(m, r, pfnum);
1166 1166 idx = (idx < pcsz) ? idx : 0;
1167 1167 }
1168 1168 for (mrange = 0; mrange < nranges; mrange++) {
1169 1169 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1170 1170 r, mrange) != NULL)
1171 1171 PAGE_COUNTERS_CURRENT_COLOR(m,
1172 1172 r, i, mrange) = idx;
1173 1173 }
1174 1174 }
1175 1175 }
1176 1176
1177 1177 /* cache info for freeing out of the critical path */
1178 1178 if ((caddr_t)old_ctr >= kernelheap &&
1179 1179 (caddr_t)old_ctr < ekernelheap) {
1180 1180 ctr_cache[r] = old_ctr;
1181 1181 size_cache[r] = old_csz;
1182 1182 }
1183 1183 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1184 1184 size_t *tmp = old_color_array[mrange];
1185 1185 if ((caddr_t)tmp >= kernelheap &&
1186 1186 (caddr_t)tmp < ekernelheap) {
1187 1187 color_cache[r][mrange] = tmp;
1188 1188 }
1189 1189 }
1190 1190 /*
1191 1191 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192 1192 * satisfy the identity requirement.
1193 1193 * We should be able to go from one to the other
1194 1194 * and get consistent values.
1195 1195 */
1196 1196 ASSERT(PNUM_TO_IDX(mnode, r,
1197 1197 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1198 1198 ASSERT(IDX_TO_PNUM(mnode, r,
1199 1199 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1200 1200
1201 1201 /* pcc_info_t and pcc_color_free */
1202 1202 for (i = 0; i < NPC_MUTEX; i++) {
1203 1203 pcc_info_t *epi;
1204 1204 pcc_info_t *eold_pi;
1205 1205
1206 1206 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1207 1207 old_pi = page_ctrs_cands[i][r][mnode];
1208 1208 page_ctrs_cands[i][r][mnode] = pi;
1209 1209 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1210 1210
1211 1211 /* preserve old pcc_color_free values, if any */
1212 1212 if (old_pi == NULL)
1213 1213 continue;
1214 1214
1215 1215 /*
1216 1216 * when/if x86 does DR, must account for
1217 1217 * possible change in range index when
1218 1218 * preserving pcc_info
1219 1219 */
1220 1220 epi = &pi[nranges];
1221 1221 eold_pi = &old_pi[old_nranges];
1222 1222 if (new_maxmrange > old_maxmrange) {
1223 1223 pi += new_maxmrange - old_maxmrange;
1224 1224 } else if (new_maxmrange < old_maxmrange) {
1225 1225 old_pi += old_maxmrange - new_maxmrange;
1226 1226 }
1227 1227 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1228 1228 pcc_info_t tmp = *pi;
1229 1229 *pi = *old_pi;
1230 1230 *old_pi = tmp;
1231 1231 }
1232 1232 }
1233 1233 }
1234 1234 PAGE_CTRS_WRITE_UNLOCK(mnode);
1235 1235
1236 1236 /*
1237 1237 * Now that we have dropped the write lock, it is safe to free all
1238 1238 * of the memory we have cached above.
1239 1239 * We come thru here to free memory when pre-alloc fails, and also to
1240 1240 * free old pointers which were recorded while locked.
1241 1241 */
1242 1242 cleanup:
1243 1243 for (r = 1; r < mmu_page_sizes; r++) {
1244 1244 if (ctr_cache[r] != NULL) {
1245 1245 kmem_free(ctr_cache[r],
1246 1246 size_cache[r] * sizeof (hpmctr_t));
1247 1247 }
1248 1248 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1249 1249 if (color_cache[r][mrange] != NULL) {
1250 1250 kmem_free(color_cache[r][mrange],
1251 1251 colors_per_szc[r] * sizeof (size_t));
1252 1252 }
1253 1253 }
1254 1254 for (i = 0; i < NPC_MUTEX; i++) {
1255 1255 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1256 1256 if (pi == NULL)
1257 1257 continue;
1258 1258 nr = cands_cache_nranges;
1259 1259 for (mrange = 0; mrange < nr; mrange++, pi++) {
1260 1260 pgcntp = pi->pcc_color_free;
1261 1261 if (pgcntp == NULL)
1262 1262 continue;
1263 1263 if ((caddr_t)pgcntp >= kernelheap &&
1264 1264 (caddr_t)pgcntp < ekernelheap) {
1265 1265 kmem_free(pgcntp,
1266 1266 colors_per_szc[r] *
1267 1267 sizeof (pgcnt_t));
1268 1268 }
1269 1269 }
1270 1270 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1271 1271 if ((caddr_t)pi >= kernelheap &&
1272 1272 (caddr_t)pi < ekernelheap) {
1273 1273 kmem_free(pi, nr * sizeof (pcc_info_t));
1274 1274 }
1275 1275 }
1276 1276 }
1277 1277
1278 1278 kmem_free(cands_cache,
1279 1279 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1280 1280 return (rc);
1281 1281 }
1282 1282
1283 1283 /*
1284 1284 * Cleanup the hpm_counters field in the page counters
1285 1285 * array.
1286 1286 */
1287 1287 void
1288 1288 page_ctrs_cleanup(void)
1289 1289 {
1290 1290 int r; /* region size */
1291 1291 int i; /* mnode index */
1292 1292
1293 1293 /*
1294 1294 * Get the page counters write lock while we are
1295 1295 * setting the page hpm_counters field to NULL
1296 1296 * for non-existent mnodes.
1297 1297 */
1298 1298 for (i = 0; i < max_mem_nodes; i++) {
1299 1299 PAGE_CTRS_WRITE_LOCK(i);
1300 1300 if (mem_node_config[i].exists) {
1301 1301 PAGE_CTRS_WRITE_UNLOCK(i);
1302 1302 continue;
1303 1303 }
1304 1304 for (r = 1; r < mmu_page_sizes; r++) {
1305 1305 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306 1306 }
1307 1307 PAGE_CTRS_WRITE_UNLOCK(i);
1308 1308 }
1309 1309 }
1310 1310
1311 1311 #ifdef DEBUG
1312 1312
1313 1313 /*
1314 1314 * confirm pp is a large page corresponding to szc
1315 1315 */
1316 1316 void
1317 1317 chk_lpg(page_t *pp, uchar_t szc)
1318 1318 {
1319 1319 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320 1320 uint_t noreloc;
1321 1321
1322 1322 if (npgs == 1) {
1323 1323 ASSERT(pp->p_szc == 0);
1324 1324 ASSERT(pp->p_next == pp);
1325 1325 ASSERT(pp->p_prev == pp);
1326 1326 return;
1327 1327 }
1328 1328
1329 1329 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1330 1330 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1331 1331
1332 1332 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333 1333 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334 1334 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335 1335 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336 1336
1337 1337 /*
1338 1338 * Check list of pages.
1339 1339 */
1340 1340 noreloc = PP_ISNORELOC(pp);
1341 1341 while (npgs--) {
1342 1342 if (npgs != 0) {
1343 1343 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344 1344 ASSERT(pp->p_next == (pp + 1));
1345 1345 }
1346 1346 ASSERT(pp->p_szc == szc);
1347 1347 ASSERT(PP_ISFREE(pp));
1348 1348 ASSERT(PP_ISAGED(pp));
1349 1349 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1350 1350 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1351 1351 ASSERT(pp->p_vnode == NULL);
1352 1352 ASSERT(PP_ISNORELOC(pp) == noreloc);
1353 1353
1354 1354 pp = pp->p_next;
1355 1355 }
1356 1356 }
1357 1357 #endif /* DEBUG */
1358 1358
1359 1359 void
1360 1360 page_freelist_lock(int mnode)
1361 1361 {
1362 1362 int i;
1363 1363 for (i = 0; i < NPC_MUTEX; i++) {
1364 1364 mutex_enter(FPC_MUTEX(mnode, i));
1365 1365 mutex_enter(CPC_MUTEX(mnode, i));
1366 1366 }
1367 1367 }
1368 1368
1369 1369 void
1370 1370 page_freelist_unlock(int mnode)
1371 1371 {
1372 1372 int i;
1373 1373 for (i = 0; i < NPC_MUTEX; i++) {
1374 1374 mutex_exit(FPC_MUTEX(mnode, i));
1375 1375 mutex_exit(CPC_MUTEX(mnode, i));
1376 1376 }
1377 1377 }
1378 1378
1379 1379 /*
1380 1380 * add pp to the specified page list. Defaults to head of the page list
1381 1381 * unless PG_LIST_TAIL is specified.
1382 1382 */
1383 1383 void
1384 1384 page_list_add(page_t *pp, int flags)
1385 1385 {
1386 1386 page_t **ppp;
1387 1387 kmutex_t *pcm;
1388 1388 uint_t bin, mtype;
1389 1389 int mnode;
1390 1390
1391 1391 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392 1392 ASSERT(PP_ISFREE(pp));
1393 1393 ASSERT(!hat_page_is_mapped(pp));
1394 1394 ASSERT(hat_page_getshare(pp) == 0);
1395 1395
1396 1396 /*
1397 1397 * Large pages should be freed via page_list_add_pages().
1398 1398 */
1399 1399 ASSERT(pp->p_szc == 0);
1400 1400
1401 1401 /*
1402 1402 * Don't need to lock the freelist first here
1403 1403 * because the page isn't on the freelist yet.
1404 1404 * This means p_szc can't change on us.
1405 1405 */
1406 1406
1407 1407 bin = PP_2_BIN(pp);
1408 1408 mnode = PP_2_MEM_NODE(pp);
1409 1409 mtype = PP_2_MTYPE(pp);
1410 1410
1411 1411 if (flags & PG_LIST_ISINIT) {
1412 1412 /*
1413 1413  * PG_LIST_ISINIT is set during system startup (i.e. single
1414 1414  * threaded), add a page to the free list and add to
1415 1415  * the free region counters w/o any locking
1416 1416 */
1417 1417 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418 1418
1419 1419 /* inline version of page_add() */
1420 1420 if (*ppp != NULL) {
1421 1421 pp->p_next = *ppp;
1422 1422 pp->p_prev = (*ppp)->p_prev;
1423 1423 (*ppp)->p_prev = pp;
1424 1424 pp->p_prev->p_next = pp;
1425 1425 } else
1426 1426 *ppp = pp;
1427 1427
1428 1428 page_ctr_add_internal(mnode, mtype, pp, flags);
1429 1429 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430 1430 } else {
1431 1431 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432 1432
1433 1433 if (flags & PG_FREE_LIST) {
1434 1434 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435 1435 ASSERT(PP_ISAGED(pp));
1436 1436 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437 1437
1438 1438 } else {
1439 1439 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440 1440 ASSERT(pp->p_vnode);
1441 1441 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442 1442 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443 1443 }
1444 1444 mutex_enter(pcm);
1445 1445 page_add(ppp, pp);
1446 1446
1447 1447 if (flags & PG_LIST_TAIL)
1448 1448 *ppp = (*ppp)->p_next;
1449 1449 /*
1450 1450 * Add counters before releasing pcm mutex to avoid a race with
1451 1451 * page_freelist_coalesce and page_freelist_split.
1452 1452 */
1453 1453 page_ctr_add(mnode, mtype, pp, flags);
1454 1454 mutex_exit(pcm);
1455 1455 }
1456 1456
1457 1457
1458 1458 #if defined(__sparc)
1459 1459 if (PP_ISNORELOC(pp)) {
1460 1460 kcage_freemem_add(1);
1461 1461 }
1462 1462 #endif
1463 1463 /*
1464 1464 * It is up to the caller to unlock the page!
1465 1465 */
1466 1466 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 1467 }
1468 1468
1469 1469
1470 1470 #ifdef __sparc
1471 1471 /*
1472 1472 * This routine is only used by kcage_init during system startup.
1473 1473 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474 1474 * without the overhead of taking locks and updating counters.
1475 1475 */
1476 1476 void
1477 1477 page_list_noreloc_startup(page_t *pp)
1478 1478 {
1479 1479 page_t **ppp;
1480 1480 uint_t bin;
1481 1481 int mnode;
1482 1482 int mtype;
1483 1483 int flags = 0;
1484 1484
1485 1485 /*
1486 1486 * If this is a large page on the freelist then
1487 1487 * break it up into smaller pages.
1488 1488 */
1489 1489 if (pp->p_szc != 0)
1490 1490 page_boot_demote(pp);
1491 1491
1492 1492 /*
1493 1493 * Get list page is currently on.
1494 1494 */
1495 1495 bin = PP_2_BIN(pp);
1496 1496 mnode = PP_2_MEM_NODE(pp);
1497 1497 mtype = PP_2_MTYPE(pp);
1498 1498 ASSERT(mtype == MTYPE_RELOC);
1499 1499 ASSERT(pp->p_szc == 0);
1500 1500
1501 1501 if (PP_ISAGED(pp)) {
1502 1502 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503 1503 flags |= PG_FREE_LIST;
1504 1504 } else {
1505 1505 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506 1506 flags |= PG_CACHE_LIST;
1507 1507 }
1508 1508
1509 1509 ASSERT(*ppp != NULL);
1510 1510
1511 1511 /*
1512 1512 * Delete page from current list.
1513 1513 */
1514 1514 if (*ppp == pp)
1515 1515 *ppp = pp->p_next; /* go to next page */
1516 1516 if (*ppp == pp) {
1517 1517 *ppp = NULL; /* page list is gone */
1518 1518 } else {
1519 1519 pp->p_prev->p_next = pp->p_next;
1520 1520 pp->p_next->p_prev = pp->p_prev;
1521 1521 }
1522 1522
1523 1523 /*
1524 1524 * Decrement page counters
1525 1525 */
1526 1526 page_ctr_sub_internal(mnode, mtype, pp, flags);
1527 1527
1528 1528 /*
1529 1529 * Set no reloc for cage initted pages.
1530 1530 */
1531 1531 PP_SETNORELOC(pp);
1532 1532
1533 1533 mtype = PP_2_MTYPE(pp);
1534 1534 ASSERT(mtype == MTYPE_NORELOC);
1535 1535
1536 1536 /*
1537 1537 * Get new list for page.
1538 1538 */
1539 1539 if (PP_ISAGED(pp)) {
1540 1540 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541 1541 } else {
1542 1542 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543 1543 }
1544 1544
1545 1545 /*
1546 1546 * Insert page on new list.
1547 1547 */
1548 1548 if (*ppp == NULL) {
1549 1549 *ppp = pp;
1550 1550 pp->p_next = pp->p_prev = pp;
1551 1551 } else {
1552 1552 pp->p_next = *ppp;
1553 1553 pp->p_prev = (*ppp)->p_prev;
1554 1554 (*ppp)->p_prev = pp;
1555 1555 pp->p_prev->p_next = pp;
1556 1556 }
1557 1557
1558 1558 /*
1559 1559 * Increment page counters
1560 1560 */
1561 1561 page_ctr_add_internal(mnode, mtype, pp, flags);
1562 1562
1563 1563 /*
1564 1564 * Update cage freemem counter
1565 1565 */
1566 1566 atomic_inc_ulong(&kcage_freemem);
1567 1567 }
1568 1568 #else /* __sparc */
1569 1569
1570 1570 /* ARGSUSED */
1571 1571 void
1572 1572 page_list_noreloc_startup(page_t *pp)
1573 1573 {
1574 1574 panic("page_list_noreloc_startup: should be here only for sparc");
1575 1575 }
1576 1576 #endif
1577 1577
1578 1578 void
1579 1579 page_list_add_pages(page_t *pp, int flags)
1580 1580 {
1581 1581 kmutex_t *pcm;
1582 1582 pgcnt_t pgcnt;
1583 1583 uint_t bin, mtype, i;
1584 1584 int mnode;
1585 1585
1586 1586 /* default to freelist/head */
1587 1587 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588 1588
1589 1589 CHK_LPG(pp, pp->p_szc);
1590 1590 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591 1591
1592 1592 bin = PP_2_BIN(pp);
1593 1593 mnode = PP_2_MEM_NODE(pp);
1594 1594 mtype = PP_2_MTYPE(pp);
1595 1595
1596 1596 if (flags & PG_LIST_ISINIT) {
1597 1597 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598 1598 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599 1599 ASSERT(!PP_ISNORELOC(pp));
1600 1600 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601 1601 } else {
1602 1602
1603 1603 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604 1604
1605 1605 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606 1606
1607 1607 mutex_enter(pcm);
1608 1608 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609 1609 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610 1610 mutex_exit(pcm);
1611 1611
1612 1612 pgcnt = page_get_pagecnt(pp->p_szc);
1613 1613 #if defined(__sparc)
1614 1614 if (PP_ISNORELOC(pp))
1615 1615 kcage_freemem_add(pgcnt);
1616 1616 #endif
1617 1617 for (i = 0; i < pgcnt; i++, pp++)
1618 1618 page_unlock_nocapture(pp);
1619 1619 }
1620 1620 }
1621 1621
1622 1622 /*
1623 1623 * During boot, need to demote a large page to base
1624 1624 * pagesize pages for seg_kmem for use in boot_alloc()
1625 1625 */
1626 1626 void
1627 1627 page_boot_demote(page_t *pp)
1628 1628 {
1629 1629 ASSERT(pp->p_szc != 0);
1630 1630 ASSERT(PP_ISFREE(pp));
1631 1631 ASSERT(PP_ISAGED(pp));
1632 1632
1633 1633 (void) page_demote(PP_2_MEM_NODE(pp),
1634 1634 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635 1635 PC_FREE);
1636 1636
1637 1637 ASSERT(PP_ISFREE(pp));
1638 1638 ASSERT(PP_ISAGED(pp));
1639 1639 ASSERT(pp->p_szc == 0);
1640 1640 }
1641 1641
1642 1642 /*
1643 1643 * Take a particular page off of whatever freelist the page
1644 1644 * is claimed to be on.
1645 1645 *
1646 1646 * NOTE: Only used for PAGESIZE pages.
1647 1647 */
1648 1648 void
1649 1649 page_list_sub(page_t *pp, int flags)
1650 1650 {
1651 1651 int bin;
1652 1652 uint_t mtype;
1653 1653 int mnode;
1654 1654 kmutex_t *pcm;
1655 1655 page_t **ppp;
1656 1656
1657 1657 ASSERT(PAGE_EXCL(pp));
1658 1658 ASSERT(PP_ISFREE(pp));
1659 1659
1660 1660 /*
1661 1661 * The p_szc field can only be changed by page_promote()
1662 1662 * and page_demote(). Only free pages can be promoted and
1663 1663 * demoted and the free list MUST be locked during these
1664 1664 * operations. So to prevent a race in page_list_sub()
1665 1665 * between computing which bin of the freelist lock to
1666 1666  * grab and actually grabbing the lock we check again that
1667 1667 * the bin we locked is still the correct one. Notice that
1668 1668 * the p_szc field could have actually changed on us but
1669 1669 * if the bin happens to still be the same we are safe.
1670 1670 */
1671 1671 try_again:
1672 1672 bin = PP_2_BIN(pp);
1673 1673 mnode = PP_2_MEM_NODE(pp);
1674 1674 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675 1675 mutex_enter(pcm);
1676 1676 if (PP_2_BIN(pp) != bin) {
1677 1677 mutex_exit(pcm);
1678 1678 goto try_again;
1679 1679 }
1680 1680 mtype = PP_2_MTYPE(pp);
1681 1681
1682 1682 if (flags & PG_FREE_LIST) {
1683 1683 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684 1684 ASSERT(PP_ISAGED(pp));
1685 1685 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686 1686 } else {
1687 1687 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688 1688 ASSERT(!PP_ISAGED(pp));
1689 1689 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690 1690 }
1691 1691
1692 1692 /*
1693 1693 * Common PAGESIZE case.
1694 1694 *
1695 1695 * Note that we locked the freelist. This prevents
1696 1696 * any page promotion/demotion operations. Therefore
1697 1697 * the p_szc will not change until we drop pcm mutex.
1698 1698 */
1699 1699 if (pp->p_szc == 0) {
1700 1700 page_sub(ppp, pp);
1701 1701 /*
1702 1702 * Subtract counters before releasing pcm mutex
1703 1703 * to avoid race with page_freelist_coalesce.
1704 1704 */
1705 1705 page_ctr_sub(mnode, mtype, pp, flags);
1706 1706 mutex_exit(pcm);
1707 1707
1708 1708 #if defined(__sparc)
1709 1709 if (PP_ISNORELOC(pp)) {
1710 1710 kcage_freemem_sub(1);
1711 1711 }
1712 1712 #endif
1713 1713 return;
1714 1714 }
1715 1715
1716 1716 /*
1717 1717 * Large pages on the cache list are not supported.
1718 1718 */
1719 1719 if (flags & PG_CACHE_LIST)
1720 1720 panic("page_list_sub: large page on cachelist");
1721 1721
1722 1722 /*
1723 1723 * Slow but rare.
1724 1724 *
1725 1725 * Somebody wants this particular page which is part
1726 1726 * of a large page. In this case we just demote the page
1727 1727 * if it's on the freelist.
1728 1728 *
1729 1729 * We have to drop pcm before locking the entire freelist.
1730 1730 * Once we have re-locked the freelist check to make sure
1731 1731 * the page hasn't already been demoted or completely
1732 1732 * freed.
1733 1733 */
1734 1734 mutex_exit(pcm);
1735 1735 page_freelist_lock(mnode);
1736 1736 if (pp->p_szc != 0) {
1737 1737 /*
1738 1738 * Large page is on freelist.
1739 1739 */
1740 1740 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741 1741 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742 1742 }
1743 1743 ASSERT(PP_ISFREE(pp));
1744 1744 ASSERT(PP_ISAGED(pp));
1745 1745 ASSERT(pp->p_szc == 0);
1746 1746
1747 1747 /*
1748 1748 * Subtract counters before releasing pcm mutex
1749 1749 * to avoid race with page_freelist_coalesce.
1750 1750 */
1751 1751 bin = PP_2_BIN(pp);
1752 1752 mtype = PP_2_MTYPE(pp);
1753 1753 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754 1754
1755 1755 page_sub(ppp, pp);
1756 1756 page_ctr_sub(mnode, mtype, pp, flags);
1757 1757 page_freelist_unlock(mnode);
1758 1758
1759 1759 #if defined(__sparc)
1760 1760 if (PP_ISNORELOC(pp)) {
1761 1761 kcage_freemem_sub(1);
1762 1762 }
1763 1763 #endif
1764 1764 }
1765 1765
1766 1766 void
1767 1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 1768 {
1769 1769 kmutex_t *pcm;
1770 1770 uint_t bin, mtype;
1771 1771 int mnode;
1772 1772
1773 1773 ASSERT(PAGE_EXCL(pp));
1774 1774 ASSERT(PP_ISFREE(pp));
1775 1775 ASSERT(PP_ISAGED(pp));
1776 1776
1777 1777 /*
1778 1778 * See comment in page_list_sub().
1779 1779 */
1780 1780 try_again:
1781 1781 bin = PP_2_BIN(pp);
1782 1782 mnode = PP_2_MEM_NODE(pp);
1783 1783 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784 1784 mutex_enter(pcm);
1785 1785 if (PP_2_BIN(pp) != bin) {
1786 1786 mutex_exit(pcm);
1787 1787 goto try_again;
1788 1788 }
1789 1789
1790 1790 /*
1791 1791 * If we're called with a page larger than szc or it got
1792 1792 * promoted above szc before we locked the freelist then
1793 1793 * drop pcm and re-lock entire freelist. If page still larger
1794 1794 * than szc then demote it.
1795 1795 */
1796 1796 if (pp->p_szc > szc) {
1797 1797 mutex_exit(pcm);
1798 1798 pcm = NULL;
1799 1799 page_freelist_lock(mnode);
1800 1800 if (pp->p_szc > szc) {
1801 1801 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802 1802 (void) page_demote(mnode,
1803 1803 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804 1804 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805 1805 }
1806 1806 bin = PP_2_BIN(pp);
1807 1807 }
1808 1808 ASSERT(PP_ISFREE(pp));
1809 1809 ASSERT(PP_ISAGED(pp));
1810 1810 ASSERT(pp->p_szc <= szc);
1811 1811 ASSERT(pp == PP_PAGEROOT(pp));
1812 1812
1813 1813 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814 1814
1815 1815 mtype = PP_2_MTYPE(pp);
1816 1816 if (pp->p_szc != 0) {
1817 1817 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 1818 CHK_LPG(pp, pp->p_szc);
1819 1819 } else {
1820 1820 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821 1821 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822 1822 }
1823 1823 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824 1824
1825 1825 if (pcm != NULL) {
1826 1826 mutex_exit(pcm);
1827 1827 } else {
1828 1828 page_freelist_unlock(mnode);
1829 1829 }
1830 1830
1831 1831 #if defined(__sparc)
1832 1832 if (PP_ISNORELOC(pp)) {
1833 1833 pgcnt_t pgcnt;
1834 1834
1835 1835 pgcnt = page_get_pagecnt(pp->p_szc);
1836 1836 kcage_freemem_sub(pgcnt);
1837 1837 }
1838 1838 #endif
1839 1839 }
1840 1840
1841 1841 /*
1842 1842 * Add the page to the front of a linked list of pages
1843 1843 * using the p_next & p_prev pointers for the list.
1844 1844 * The caller is responsible for protecting the list pointers.
1845 1845 */
1846 1846 void
1847 1847 mach_page_add(page_t **ppp, page_t *pp)
1848 1848 {
1849 1849 if (*ppp == NULL) {
1850 1850 pp->p_next = pp->p_prev = pp;
1851 1851 } else {
1852 1852 pp->p_next = *ppp;
1853 1853 pp->p_prev = (*ppp)->p_prev;
1854 1854 (*ppp)->p_prev = pp;
1855 1855 pp->p_prev->p_next = pp;
1856 1856 }
1857 1857 *ppp = pp;
1858 1858 }
1859 1859
1860 1860 /*
1861 1861 * Remove this page from a linked list of pages
1862 1862 * using the p_next & p_prev pointers for the list.
1863 1863 *
1864 1864 * The caller is responsible for protecting the list pointers.
1865 1865 */
1866 1866 void
1867 1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 1868 {
1869 1869 ASSERT(PP_ISFREE(pp));
1870 1870
1871 1871 if (*ppp == NULL || pp == NULL)
1872 1872 panic("mach_page_sub");
1873 1873
1874 1874 if (*ppp == pp)
1875 1875 *ppp = pp->p_next; /* go to next page */
1876 1876
1877 1877 if (*ppp == pp)
1878 1878 *ppp = NULL; /* page list is gone */
1879 1879 else {
1880 1880 pp->p_prev->p_next = pp->p_next;
1881 1881 pp->p_next->p_prev = pp->p_prev;
1882 1882 }
1883 1883 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1884 1884 }
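
/*
 * Illustrative sketch of the list convention used by mach_page_add() and
 * mach_page_sub() above: the list is circular and doubly linked through
 * p_next/p_prev, with *ppp pointing at the head and new pages added in
 * front of it. Assuming three hypothetical, already-free pages a, b and c:
 *
 *	page_t *list = NULL;
 *	mach_page_add(&list, a);	list = a:  a <-> a (self-linked)
 *	mach_page_add(&list, b);	list = b:  b <-> a <-> b
 *	mach_page_add(&list, c);	list = c:  c <-> b <-> a <-> c
 *	mach_page_sub(&list, b);	list = c:  c <-> a <-> c
 *	mach_page_sub(&list, c);	list = a:  a <-> a
 *	mach_page_sub(&list, a);	list = NULL
 *
 * A page removed by mach_page_sub() is left self-linked, so it is always
 * a valid list of one.
 */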
1885 1885
1886 1886 /*
1887 1887 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888 1888 */
1889 1889 void
1890 1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 1891 {
1892 1892 pfn_t pfn;
1893 1893 int mnode;
1894 1894 int idx;
1895 1895 int new_szc = cur_szc + 1;
1896 1896 int full = FULL_REGION_CNT(new_szc);
1897 1897
1898 1898 pfn = page_pptonum(pp);
1899 1899 mnode = PFN_2_MEM_NODE(pfn);
1900 1900
1901 1901 page_freelist_lock(mnode);
1902 1902
1903 1903 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904 1904 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905 1905 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906 1906
1907 1907 page_freelist_unlock(mnode);
1908 1908 }
1909 1909
1910 1910 static uint_t page_promote_err;
1911 1911 static uint_t page_promote_noreloc_err;
1912 1912
1913 1913 /*
1914 1914 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915 1915 * for the given mnode starting at pfnum. Pages involved are on the freelist
1916 1916 * before the call and may be returned to the caller if requested, otherwise
1917 1917 * they will be placed back on the freelist.
1918 1918 * If flags is PC_ALLOC, then the large page will be returned to the user in
1919 1919 * a state which is consistent with a page being taken off the freelist. If
1920 1920 * we failed to lock the new large page, then we will return NULL to the
1921 1921 * caller and put the large page on the freelist instead.
1922 1922 * If flags is PC_FREE, then the large page will be placed on the freelist,
1923 1923 * and NULL will be returned.
1924 1924 * The caller is responsible for locking the freelist as well as any other
1925 1925 * accounting which needs to be done for a returned page.
1926 1926 *
1927 1927 * RFE: For performance pass in pp instead of pfnum so
1928 1928 * we can avoid excessive calls to page_numtopp_nolock().
1929 1929 * This would depend on an assumption that all contiguous
1930 1930 * pages are in the same memseg so we can just add/dec
1931 1931 * our pp.
1932 1932 *
1933 1933 * Lock ordering:
1934 1934 *
1935 1935 * There is a potential but rare deadlock situation
1936 1936 * for page promotion and demotion operations. The problem
1937 1937 * is there are two paths into the freelist manager and
1938 1938 * they have different lock orders:
1939 1939 *
1940 1940 * page_create()
1941 1941 * lock freelist
1942 1942 * page_lock(EXCL)
1943 1943 * unlock freelist
1944 1944 * return
1945 1945 * caller drops page_lock
1946 1946 *
1947 1947 * page_free() and page_reclaim()
1948 1948 * caller grabs page_lock(EXCL)
1949 1949 *
1950 1950 * lock freelist
1951 1951 * unlock freelist
1952 1952 * drop page_lock
1953 1953 *
1954 1954 * What prevents a thread in page_create() from deadlocking
1955 1955 * with a thread freeing or reclaiming the same page is the
1956 1956 * page_trylock() in page_get_freelist(). If the trylock fails
1957 1957 * it skips the page.
1958 1958 *
1959 1959 * The lock ordering for promotion and demotion is the same as
1960 1960 * for page_create(). Since the same deadlock could occur during
1961 1961 * page promotion and freeing or reclaiming of a page on the
1962 1962 * cache list we might have to fail the operation and undo what
1963 1963  * we have done so far. Again, this is rare.
1964 1964 */
1965 1965 page_t *
1966 1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 1967 {
1968 1968 page_t *pp, *pplist, *tpp, *start_pp;
1969 1969 pgcnt_t new_npgs, npgs;
1970 1970 uint_t bin;
1971 1971 pgcnt_t tmpnpgs, pages_left;
1972 1972 uint_t noreloc;
1973 1973 int which_list;
1974 1974 ulong_t index;
1975 1975 kmutex_t *phm;
1976 1976
1977 1977 /*
1978 1978 * General algorithm:
1979 1979 * Find the starting page
1980 1980 * Walk each page struct removing it from the freelist,
1981 1981 * and linking it to all the other pages removed.
1982 1982 * Once all pages are off the freelist,
1983 1983 	 * walk the list, modifying p_szc to new_szc and doing
1984 1984 	 * whatever else is needed to create a large free page.
1985 1985 * According to the flags, either return the page or put it
1986 1986 * on the freelist.
1987 1987 */
1988 1988
1989 1989 start_pp = page_numtopp_nolock(pfnum);
1990 1990 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991 1991 new_npgs = page_get_pagecnt(new_szc);
1992 1992 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993 1993
1994 1994 /* don't return page of the wrong mtype */
1995 1995 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996 1996 return (NULL);
1997 1997
1998 1998 /*
1999 1999 * Loop through smaller pages to confirm that all pages
2000 2000 * give the same result for PP_ISNORELOC().
2001 2001 * We can check this reliably here as the protocol for setting
2002 2002 * P_NORELOC requires pages to be taken off the free list first.
2003 2003 */
2004 2004 noreloc = PP_ISNORELOC(start_pp);
2005 2005 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006 2006 if (noreloc != PP_ISNORELOC(pp)) {
2007 2007 page_promote_noreloc_err++;
2008 2008 page_promote_err++;
2009 2009 return (NULL);
2010 2010 }
2011 2011 }
2012 2012
2013 2013 pages_left = new_npgs;
2014 2014 pplist = NULL;
2015 2015 pp = start_pp;
2016 2016
2017 2017 /* Loop around coalescing the smaller pages into a big page. */
2018 2018 while (pages_left) {
2019 2019 /*
2020 2020 * Remove from the freelist.
2021 2021 */
2022 2022 ASSERT(PP_ISFREE(pp));
2023 2023 bin = PP_2_BIN(pp);
2024 2024 ASSERT(mnode == PP_2_MEM_NODE(pp));
2025 2025 mtype = PP_2_MTYPE(pp);
2026 2026 if (PP_ISAGED(pp)) {
2027 2027
2028 2028 /*
2029 2029 * PG_FREE_LIST
2030 2030 */
2031 2031 if (pp->p_szc) {
2032 2032 page_vpsub(&PAGE_FREELISTS(mnode,
2033 2033 pp->p_szc, bin, mtype), pp);
2034 2034 } else {
2035 2035 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036 2036 bin, mtype), pp);
2037 2037 }
2038 2038 which_list = PG_FREE_LIST;
2039 2039 } else {
2040 2040 ASSERT(pp->p_szc == 0);
2041 2041
2042 2042 /*
2043 2043 * PG_CACHE_LIST
2044 2044 *
2045 2045 * Since this page comes from the
2046 2046 * cachelist, we must destroy the
2047 2047 * vnode association.
2048 2048 */
2049 2049 if (!page_trylock(pp, SE_EXCL)) {
2050 2050 goto fail_promote;
2051 2051 }
2052 2052
2053 2053 /*
2054 2054 * We need to be careful not to deadlock
2055 2055 * with another thread in page_lookup().
2056 2056 * The page_lookup() thread could be holding
2057 2057 * the same phm that we need if the two
2058 2058 * pages happen to hash to the same phm lock.
2059 2059 * At this point we have locked the entire
2060 2060 * freelist and page_lookup() could be trying
2061 2061 * to grab a freelist lock.
2062 2062 */
2063 2063 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2064 2064 phm = PAGE_HASH_MUTEX(index);
2065 2065 if (!mutex_tryenter(phm)) {
2066 2066 page_unlock_nocapture(pp);
2067 2067 goto fail_promote;
2068 2068 }
2069 2069
2070 2070 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2071 2071 page_hashout(pp, phm);
2072 2072 mutex_exit(phm);
2073 2073 PP_SETAGED(pp);
2074 2074 page_unlock_nocapture(pp);
2075 2075 which_list = PG_CACHE_LIST;
2076 2076 }
2077 2077 page_ctr_sub(mnode, mtype, pp, which_list);
2078 2078
2079 2079 /*
2080 2080 * Concatenate the smaller page(s) onto
2081 2081 * the large page list.
2082 2082 */
2083 2083 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2084 2084 pages_left -= npgs;
2085 2085 tpp = pp;
2086 2086 while (npgs--) {
2087 2087 tpp->p_szc = new_szc;
2088 2088 tpp = tpp->p_next;
2089 2089 }
2090 2090 page_list_concat(&pplist, &pp);
2091 2091 pp += tmpnpgs;
2092 2092 }
2093 2093 CHK_LPG(pplist, new_szc);
2094 2094
2095 2095 /*
2096 2096 * return the page to the user if requested
2097 2097 * in the properly locked state.
2098 2098 */
2099 2099 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2100 2100 return (pplist);
2101 2101 }
2102 2102
2103 2103 /*
2104 2104 * Otherwise place the new large page on the freelist
2105 2105 */
2106 2106 bin = PP_2_BIN(pplist);
2107 2107 mnode = PP_2_MEM_NODE(pplist);
2108 2108 mtype = PP_2_MTYPE(pplist);
2109 2109 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2110 2110
2111 2111 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2112 2112 return (NULL);
2113 2113
2114 2114 fail_promote:
2115 2115 /*
2116 2116 * A thread must have still been freeing or
2117 2117 * reclaiming the page on the cachelist.
2118 2118 * To prevent a deadlock undo what we have
2119 2119 	 * done so far and return failure. This
2120 2120 * situation can only happen while promoting
2121 2121 * PAGESIZE pages.
2122 2122 */
2123 2123 page_promote_err++;
2124 2124 while (pplist) {
2125 2125 pp = pplist;
2126 2126 mach_page_sub(&pplist, pp);
2127 2127 pp->p_szc = 0;
2128 2128 bin = PP_2_BIN(pp);
2129 2129 mtype = PP_2_MTYPE(pp);
2130 2130 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2131 2131 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2132 2132 }
2133 2133 return (NULL);
2134 2134
2135 2135 }
2136 2136
2137 2137 /*
2138 2138 * Break up a large page into smaller size pages.
2139 2139 * Pages involved are on the freelist before the call and may
2140 2140 * be returned to the caller if requested, otherwise they will
2141 2141 * be placed back on the freelist.
2142 2142 * The caller is responsible for locking the freelist as well as any other
2143 2143 * accounting which needs to be done for a returned page.
2144 2144 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2145 2145 * technically, any value may be passed in but PC_NO_COLOR is the standard
2146 2146 * which should be followed for clarity's sake.
2147 2147  * Returns a page whose pfn is < pfnmax (when pfnmax is non-zero).
2148 2148 */
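/*
 * A worked example of the loop below, using hypothetical page counts (the
 * real values are platform dependent): if a cur_szc page spans 512 PAGESIZE
 * pages and a new_szc page spans 8, the outer loop runs 64 times. Each pass
 * peels one group of 8 constituent pages off pplist with page_list_break(),
 * stamps each constituent with p_szc = new_szc, and either returns that
 * group to the caller (flags == PC_ALLOC, matching color and pfnmax, and
 * page_trylock_cons() succeeded) or places it on the new_szc freelist via
 * page_vpadd()/page_ctr_add(). At most one group is returned; the rest
 * always go back on the freelist.
 */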
2149 2149 page_t *
2150 2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2151 2151 uchar_t new_szc, int color, int flags)
2152 2152 {
2153 2153 page_t *pp, *pplist, *npplist;
2154 2154 pgcnt_t npgs, n;
2155 2155 uint_t bin;
2156 2156 uint_t mtype;
2157 2157 page_t *ret_pp = NULL;
2158 2158
2159 2159 ASSERT(cur_szc != 0);
2160 2160 ASSERT(new_szc < cur_szc);
2161 2161
2162 2162 pplist = page_numtopp_nolock(pfnum);
2163 2163 ASSERT(pplist != NULL);
2164 2164
2165 2165 ASSERT(pplist->p_szc == cur_szc);
2166 2166
2167 2167 bin = PP_2_BIN(pplist);
2168 2168 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2169 2169 mtype = PP_2_MTYPE(pplist);
2170 2170 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2171 2171
2172 2172 CHK_LPG(pplist, cur_szc);
2173 2173 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2174 2174
2175 2175 /*
2176 2176 * Number of PAGESIZE pages for smaller new_szc
2177 2177 * page.
2178 2178 */
2179 2179 npgs = page_get_pagecnt(new_szc);
2180 2180
2181 2181 while (pplist) {
2182 2182 pp = pplist;
2183 2183
2184 2184 ASSERT(pp->p_szc == cur_szc);
2185 2185
2186 2186 /*
2187 2187 * We either break it up into PAGESIZE pages or larger.
2188 2188 */
2189 2189 if (npgs == 1) { /* PAGESIZE case */
2190 2190 mach_page_sub(&pplist, pp);
2191 2191 ASSERT(pp->p_szc == cur_szc);
2192 2192 ASSERT(new_szc == 0);
2193 2193 ASSERT(mnode == PP_2_MEM_NODE(pp));
2194 2194 pp->p_szc = new_szc;
2195 2195 bin = PP_2_BIN(pp);
2196 2196 if ((bin == color) && (flags == PC_ALLOC) &&
2197 2197 (ret_pp == NULL) && (pfnmax == 0 ||
2198 2198 pp->p_pagenum < pfnmax) &&
2199 2199 page_trylock_cons(pp, SE_EXCL)) {
2200 2200 ret_pp = pp;
2201 2201 } else {
2202 2202 mtype = PP_2_MTYPE(pp);
2203 2203 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2204 2204 mtype), pp);
2205 2205 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2206 2206 }
2207 2207 } else {
2208 2208 page_t *try_to_return_this_page = NULL;
2209 2209 int count = 0;
2210 2210
2211 2211 /*
2212 2212 * Break down into smaller lists of pages.
2213 2213 */
2214 2214 page_list_break(&pplist, &npplist, npgs);
2215 2215
2216 2216 pp = pplist;
2217 2217 n = npgs;
2218 2218 while (n--) {
2219 2219 ASSERT(pp->p_szc == cur_szc);
2220 2220 /*
2221 2221 * Check whether all the pages in this list
2222 2222 * fit the request criteria.
2223 2223 */
2224 2224 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2225 2225 count++;
2226 2226 }
2227 2227 pp->p_szc = new_szc;
2228 2228 pp = pp->p_next;
2229 2229 }
2230 2230
2231 2231 if (count == npgs &&
2232 2232 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2233 2233 try_to_return_this_page = pp;
2234 2234 }
2235 2235
2236 2236 CHK_LPG(pplist, new_szc);
2237 2237
2238 2238 bin = PP_2_BIN(pplist);
2239 2239 if (try_to_return_this_page)
2240 2240 ASSERT(mnode ==
2241 2241 PP_2_MEM_NODE(try_to_return_this_page));
2242 2242 if ((bin == color) && (flags == PC_ALLOC) &&
2243 2243 (ret_pp == NULL) && try_to_return_this_page &&
2244 2244 page_trylock_cons(try_to_return_this_page,
2245 2245 SE_EXCL)) {
2246 2246 ret_pp = try_to_return_this_page;
2247 2247 } else {
2248 2248 mtype = PP_2_MTYPE(pp);
2249 2249 page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2250 2250 bin, mtype), pplist);
2251 2251
2252 2252 page_ctr_add(mnode, mtype, pplist,
2253 2253 PG_FREE_LIST);
2254 2254 }
2255 2255 pplist = npplist;
2256 2256 }
2257 2257 }
2258 2258 return (ret_pp);
2259 2259 }
2260 2260
2261 2261 int mpss_coalesce_disable = 0;
2262 2262
2263 2263 /*
2264 2264 * Coalesce free pages into a page of the given szc and color if possible.
2265 2265 * Return the pointer to the page created, otherwise, return NULL.
2266 2266 *
2267 2267 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2268 2268 */
2269 2269 page_t *
2270 2270 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2271 2271 int mtype, pfn_t pfnhi)
2272 2272 {
2273 2273 int r = szc; /* region size */
2274 2274 int mrange;
2275 2275 uint_t full, bin, color_mask, wrap = 0;
2276 2276 pfn_t pfnum, lo, hi;
2277 2277 size_t len, idx, idx0;
2278 2278 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2279 2279 page_t *ret_pp;
2280 2280 MEM_NODE_ITERATOR_DECL(it);
2281 2281 #if defined(__sparc)
2282 2282 pfn_t pfnum0, nlo, nhi;
2283 2283 #endif
2284 2284
2285 2285 if (mpss_coalesce_disable) {
2286 2286 ASSERT(szc < MMU_PAGE_SIZES);
2287 2287 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2288 2288 return (NULL);
2289 2289 }
2290 2290
2291 2291 ASSERT(szc < mmu_page_sizes);
2292 2292 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2293 2293 ASSERT(ceq_mask <= color_mask);
2294 2294 ASSERT(color <= color_mask);
2295 2295 color &= ceq_mask;
2296 2296
2297 2297 /* Prevent page_counters dynamic memory from being freed */
2298 2298 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2299 2299
2300 2300 mrange = MTYPE_2_MRANGE(mnode, mtype);
2301 2301 ASSERT(mrange < mnode_nranges[mnode]);
2302 2302 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2303 2303
2304 2304 /* get pfn range for mtype */
2305 2305 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2306 2306 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2307 2307 hi++;
2308 2308
2309 2309 /* use lower limit if given */
2310 2310 if (pfnhi != PFNNULL && pfnhi < hi)
2311 2311 hi = pfnhi;
2312 2312
2313 2313 /* round to szcpgcnt boundaries */
2314 2314 lo = P2ROUNDUP(lo, szcpgcnt);
2315 2315 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2316 2316 if (lo == (pfn_t)-1) {
2317 2317 rw_exit(&page_ctrs_rwlock[mnode]);
2318 2318 return (NULL);
2319 2319 }
2320 2320 hi = hi & ~(szcpgcnt - 1);
2321 2321
2322 2322 /* set lo to the closest pfn of the right color */
2323 2323 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2324 2324 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2325 2325 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2326 2326 &it);
2327 2327 }
2328 2328
2329 2329 if (hi <= lo) {
2330 2330 rw_exit(&page_ctrs_rwlock[mnode]);
2331 2331 return (NULL);
2332 2332 }
2333 2333
2334 2334 full = FULL_REGION_CNT(r);
2335 2335
2336 2336 /* calculate the number of page candidates and initial search index */
2337 2337 bin = color;
2338 2338 idx0 = (size_t)(-1);
2339 2339 do {
2340 2340 pgcnt_t acand;
2341 2341
2342 2342 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2343 2343 if (acand) {
2344 2344 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2345 2345 r, bin, mrange);
2346 2346 idx0 = MIN(idx0, idx);
2347 2347 cands += acand;
2348 2348 }
2349 2349 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2350 2350 } while (bin != color);
2351 2351
2352 2352 if (cands == 0) {
2353 2353 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2354 2354 rw_exit(&page_ctrs_rwlock[mnode]);
2355 2355 return (NULL);
2356 2356 }
2357 2357
2358 2358 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2359 2359 if (pfnum < lo || pfnum >= hi) {
2360 2360 pfnum = lo;
2361 2361 } else {
2362 2362 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2363 2363 if (pfnum == (pfn_t)-1) {
2364 2364 pfnum = lo;
2365 2365 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2366 2366 ASSERT(pfnum != (pfn_t)-1);
2367 2367 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2368 2368 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2369 2369 /* invalid color, get the closest correct pfn */
2370 2370 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2371 2371 color_mask, &it);
2372 2372 if (pfnum >= hi) {
2373 2373 pfnum = lo;
2374 2374 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2375 2375 }
2376 2376 }
2377 2377 }
2378 2378
2379 2379 /* set starting index */
2380 2380 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2381 2381 ASSERT(idx0 < len);
2382 2382
2383 2383 #if defined(__sparc)
2384 2384 pfnum0 = pfnum; /* page corresponding to idx0 */
2385 2385 nhi = 0; /* search kcage ranges */
2386 2386 #endif
2387 2387
2388 2388 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2389 2389
2390 2390 #if defined(__sparc)
2391 2391 /*
2392 2392 * Find lowest intersection of kcage ranges and mnode.
2393 2393 * MTYPE_NORELOC means look in the cage, otherwise outside.
2394 2394 */
2395 2395 if (nhi <= pfnum) {
2396 2396 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2397 2397 (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2398 2398 goto wrapit;
2399 2399
2400 2400 /* jump to the next page in the range */
2401 2401 if (pfnum < nlo) {
2402 2402 pfnum = P2ROUNDUP(nlo, szcpgcnt);
2403 2403 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2404 2404 idx = PNUM_TO_IDX(mnode, r, pfnum);
2405 2405 if (idx >= len || pfnum >= hi)
2406 2406 goto wrapit;
2407 2407 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2408 2408 ceq_mask)
2409 2409 goto next;
2410 2410 if (interleaved_mnodes &&
2411 2411 PFN_2_MEM_NODE(pfnum) != mnode)
2412 2412 goto next;
2413 2413 }
2414 2414 }
2415 2415 #endif
2416 2416
2417 2417 if (PAGE_COUNTERS(mnode, r, idx) != full)
2418 2418 goto next;
2419 2419
2420 2420 /*
2421 2421 * RFE: For performance maybe we can do something less
2422 2422 * brutal than locking the entire freelist. So far
2423 2423 * this doesn't seem to be a performance problem?
2424 2424 */
2425 2425 page_freelist_lock(mnode);
2426 2426 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2427 2427 ret_pp =
2428 2428 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2429 2429 if (ret_pp != NULL) {
2430 2430 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2431 2431 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2432 2432 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2433 2433 page_freelist_unlock(mnode);
2434 2434 rw_exit(&page_ctrs_rwlock[mnode]);
2435 2435 #if defined(__sparc)
2436 2436 if (PP_ISNORELOC(ret_pp)) {
2437 2437 pgcnt_t npgs;
2438 2438
2439 2439 npgs = page_get_pagecnt(ret_pp->p_szc);
2440 2440 kcage_freemem_sub(npgs);
2441 2441 }
2442 2442 #endif
2443 2443 return (ret_pp);
2444 2444 }
2445 2445 } else {
2446 2446 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2447 2447 }
2448 2448
2449 2449 page_freelist_unlock(mnode);
2450 2450 /*
2451 2451 * No point looking for another page if we've
2452 2452 * already tried all of the ones that
2453 2453 * page_ctr_cands indicated. Stash off where we left
2454 2454 * off.
2455 2455 * Note: this is not exact since we don't hold the
2456 2456 * page_freelist_locks before we initially get the
2457 2457 * value of cands for performance reasons, but should
2458 2458 * be a decent approximation.
2459 2459 */
2460 2460 if (--cands == 0) {
2461 2461 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2462 2462 idx;
2463 2463 break;
2464 2464 }
2465 2465 next:
2466 2466 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2467 2467 color_mask, &it);
2468 2468 idx = PNUM_TO_IDX(mnode, r, pfnum);
2469 2469 if (idx >= len || pfnum >= hi) {
2470 2470 wrapit:
2471 2471 pfnum = lo;
2472 2472 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2473 2473 idx = PNUM_TO_IDX(mnode, r, pfnum);
2474 2474 wrap++;
2475 2475 #if defined(__sparc)
2476 2476 nhi = 0; /* search kcage ranges */
2477 2477 #endif
2478 2478 }
2479 2479 }
2480 2480
2481 2481 rw_exit(&page_ctrs_rwlock[mnode]);
2482 2482 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2483 2483 return (NULL);
2484 2484 }
2485 2485
2486 2486 /*
2487 2487 * For the given mnode, promote as many small pages to large pages as possible.
2488 2488 * mnode can be -1, which means do them all
2489 2489 */
2490 2490 void
2491 2491 page_freelist_coalesce_all(int mnode)
2492 2492 {
2493 2493 int r; /* region size */
2494 2494 int idx, full;
2495 2495 size_t len;
2496 2496 int doall = interleaved_mnodes || mnode < 0;
2497 2497 int mlo = doall ? 0 : mnode;
2498 2498 int mhi = doall ? max_mem_nodes : (mnode + 1);
2499 2499
2500 2500 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2501 2501
2502 2502 if (mpss_coalesce_disable) {
2503 2503 return;
2504 2504 }
2505 2505
2506 2506 /*
2507 2507 * Lock the entire freelist and coalesce what we can.
2508 2508 *
2509 2509 * Always promote to the largest page possible
2510 2510 * first to reduce the number of page promotions.
2511 2511 */
2512 2512 for (mnode = mlo; mnode < mhi; mnode++) {
2513 2513 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2514 2514 page_freelist_lock(mnode);
2515 2515 }
2516 2516 for (r = mmu_page_sizes - 1; r > 0; r--) {
2517 2517 for (mnode = mlo; mnode < mhi; mnode++) {
2518 2518 pgcnt_t cands = 0;
2519 2519 int mrange, nranges = mnode_nranges[mnode];
2520 2520
2521 2521 for (mrange = 0; mrange < nranges; mrange++) {
2522 2522 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2523 2523 if (cands != 0)
2524 2524 break;
2525 2525 }
2526 2526 if (cands == 0) {
2527 2527 VM_STAT_ADD(vmm_vmstats.
2528 2528 page_ctrs_cands_skip_all);
2529 2529 continue;
2530 2530 }
2531 2531
2532 2532 full = FULL_REGION_CNT(r);
2533 2533 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2534 2534
2535 2535 for (idx = 0; idx < len; idx++) {
2536 2536 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2537 2537 pfn_t pfnum =
2538 2538 IDX_TO_PNUM(mnode, r, idx);
2539 2539 int tmnode = interleaved_mnodes ?
2540 2540 PFN_2_MEM_NODE(pfnum) : mnode;
2541 2541
2542 2542 ASSERT(pfnum >=
2543 2543 mem_node_config[tmnode].physbase &&
2544 2544 pfnum <
2545 2545 mem_node_config[tmnode].physmax);
2546 2546
2547 2547 (void) page_promote(tmnode,
2548 2548 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2549 2549 }
2550 2550 }
2551 2551 /* shared hpm_counters covers all mnodes, so we quit */
2552 2552 if (interleaved_mnodes)
2553 2553 break;
2554 2554 }
2555 2555 }
2556 2556 for (mnode = mlo; mnode < mhi; mnode++) {
2557 2557 page_freelist_unlock(mnode);
2558 2558 rw_exit(&page_ctrs_rwlock[mnode]);
2559 2559 }
2560 2560 }
2561 2561
2562 2562 /*
2563 2563  * This is where all policies for moving pages around
2564 2564  * to different page size free lists are implemented.
2565 2565 * Returns 1 on success, 0 on failure.
2566 2566 *
2567 2567 * So far these are the priorities for this algorithm in descending
2568 2568 * order:
2569 2569 *
2570 2570 * 1) When servicing a request try to do so with a free page
2571 2571 * from next size up. Helps defer fragmentation as long
2572 2572 * as possible.
2573 2573 *
2574 2574 * 2) Page coalesce on demand. Only when a freelist
2575 2575 * larger than PAGESIZE is empty and step 1
2576 2576 * will not work since all larger size lists are
2577 2577 * also empty.
2578 2578 *
2579 2579 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2580 2580 */
2581 2581
2582 2582 page_t *
2583 2583 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2584 2584 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2585 2585 {
2586 2586 uchar_t nszc = szc + 1;
2587 2587 uint_t bin, sbin, bin_prev;
2588 2588 page_t *pp, *firstpp;
2589 2589 page_t *ret_pp = NULL;
2590 2590 uint_t color_mask;
2591 2591
2592 2592 if (nszc == mmu_page_sizes)
2593 2593 return (NULL);
2594 2594
2595 2595 ASSERT(nszc < mmu_page_sizes);
2596 2596 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2597 2597 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2598 2598 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2599 2599 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2600 2600
2601 2601 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2602 2602 /*
2603 2603 * First try to break up a larger page to fill current size freelist.
2604 2604 */
2605 2605 while (plw->plw_bins[nszc] != 0) {
2606 2606
2607 2607 ASSERT(nszc < mmu_page_sizes);
2608 2608
2609 2609 /*
2610 2610 * If page found then demote it.
2611 2611 */
2612 2612 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2613 2613 page_freelist_lock(mnode);
2614 2614 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2615 2615
2616 2616 /*
2617 2617 * If pfnhi is not PFNNULL, look for large page below
2618 2618 * pfnhi. PFNNULL signifies no pfn requirement.
2619 2619 */
2620 2620 if (pp &&
2621 2621 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2622 2622 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2623 2623 do {
2624 2624 pp = pp->p_vpnext;
2625 2625 if (pp == firstpp) {
2626 2626 pp = NULL;
2627 2627 break;
2628 2628 }
2629 2629 } while ((pfnhi != PFNNULL &&
2630 2630 pp->p_pagenum >= pfnhi) ||
2631 2631 (pfnlo != PFNNULL &&
2632 2632 pp->p_pagenum < pfnlo));
2633 2633
2634 2634 if (pfnhi != PFNNULL && pp != NULL)
2635 2635 ASSERT(pp->p_pagenum < pfnhi);
2636 2636
2637 2637 if (pfnlo != PFNNULL && pp != NULL)
2638 2638 ASSERT(pp->p_pagenum >= pfnlo);
2639 2639 }
2640 2640 if (pp) {
2641 2641 uint_t ccolor = page_correct_color(szc, nszc,
2642 2642 color, bin, plw->plw_ceq_mask[szc]);
2643 2643
2644 2644 ASSERT(pp->p_szc == nszc);
2645 2645 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2646 2646 ret_pp = page_demote(mnode, pp->p_pagenum,
2647 2647 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2648 2648 if (ret_pp) {
2649 2649 page_freelist_unlock(mnode);
2650 2650 #if defined(__sparc)
2651 2651 if (PP_ISNORELOC(ret_pp)) {
2652 2652 pgcnt_t npgs;
2653 2653
2654 2654 npgs = page_get_pagecnt(
2655 2655 ret_pp->p_szc);
2656 2656 kcage_freemem_sub(npgs);
2657 2657 }
2658 2658 #endif
2659 2659 return (ret_pp);
2660 2660 }
2661 2661 }
2662 2662 page_freelist_unlock(mnode);
2663 2663 }
2664 2664
2665 2665 /* loop through next size bins */
2666 2666 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2667 2667 plw->plw_bins[nszc]--;
2668 2668
2669 2669 if (bin == sbin) {
2670 2670 uchar_t nnszc = nszc + 1;
2671 2671
2672 2672 /* we are done with this page size - check next */
2673 2673 if (plw->plw_bins[nnszc] == 0)
2674 2674 /* we have already checked next size bins */
2675 2675 break;
2676 2676
2677 2677 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2678 2678 if (bin_prev != INVALID_COLOR) {
2679 2679 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2680 2680 if (!((bin ^ bin_prev) &
2681 2681 plw->plw_ceq_mask[nnszc]))
2682 2682 break;
2683 2683 }
2684 2684 ASSERT(nnszc < mmu_page_sizes);
2685 2685 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2686 2686 nszc = nnszc;
2687 2687 ASSERT(nszc < mmu_page_sizes);
2688 2688 }
2689 2689 }
2690 2690
2691 2691 return (ret_pp);
2692 2692 }
2693 2693
2694 2694 /*
2695 2695 * Helper routine used only by the freelist code to lock
2696 2696 * a page. If the page is a large page then it succeeds in
2697 2697 * locking all the constituent pages or none at all.
2698 2698  * Returns 1 on success, 0 on failure.
2699 2699 */
2700 2700 static int
2701 2701 page_trylock_cons(page_t *pp, se_t se)
2702 2702 {
2703 2703 page_t *tpp, *first_pp = pp;
2704 2704
2705 2705 /*
2706 2706 * Fail if can't lock first or only page.
2707 2707 */
2708 2708 if (!page_trylock(pp, se)) {
2709 2709 return (0);
2710 2710 }
2711 2711
2712 2712 /*
2713 2713 * PAGESIZE: common case.
2714 2714 */
2715 2715 if (pp->p_szc == 0) {
2716 2716 return (1);
2717 2717 }
2718 2718
2719 2719 /*
2720 2720 * Large page case.
2721 2721 */
2722 2722 tpp = pp->p_next;
2723 2723 while (tpp != pp) {
2724 2724 if (!page_trylock(tpp, se)) {
2725 2725 /*
2726 2726 * On failure unlock what we have locked so far.
2727 2727 * We want to avoid attempting to capture these
2728 2728 * pages as the pcm mutex may be held which could
2729 2729 * lead to a recursive mutex panic.
2730 2730 */
2731 2731 while (first_pp != tpp) {
2732 2732 page_unlock_nocapture(first_pp);
2733 2733 first_pp = first_pp->p_next;
2734 2734 }
2735 2735 return (0);
2736 2736 }
2737 2737 tpp = tpp->p_next;
2738 2738 }
2739 2739 return (1);
2740 2740 }
2741 2741
2742 2742 /*
2743 2743 * init context for walking page lists
2744 2744  * Called when a page of the given szc is unavailable. Sets markers
2745 2745 * for the beginning of the search to detect when search has
2746 2746 * completed a full cycle. Sets flags for splitting larger pages
2747 2747  * and coalescing smaller pages. Page walking proceeds until a page
2748 2748 * of the desired equivalent color is found.
2749 2749 */
2750 2750 void
2751 2751 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2752 2752 int use_ceq, page_list_walker_t *plw)
2753 2753 {
2754 2754 uint_t nszc, ceq_mask, colors;
2755 2755 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2756 2756
2757 2757 ASSERT(szc < mmu_page_sizes);
2758 2758 colors = PAGE_GET_PAGECOLORS(szc);
2759 2759
2760 2760 plw->plw_colors = colors;
2761 2761 plw->plw_color_mask = colors - 1;
2762 2762 plw->plw_bin_marker = plw->plw_bin0 = bin;
2763 2763 plw->plw_bin_split_prev = bin;
2764 2764 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2765 2765
2766 2766 /*
2767 2767 * if vac aliasing is possible make sure lower order color
2768 2768 * bits are never ignored
2769 2769 */
2770 2770 if (vac_colors > 1)
2771 2771 ceq &= 0xf0;
2772 2772
2773 2773 /*
2774 2774 * calculate the number of non-equivalent colors and
2775 2775 * color equivalency mask
2776 2776 */
2777 2777 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2778 2778 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2779 2779 ASSERT(plw->plw_ceq_dif > 0);
2780 2780 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
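	/*
	 * Worked example with hypothetical values, assuming vac_colors == 1
	 * so the low nibble of ceq is honored: with colors == 64 and
	 * colorequivszc[szc] == 0x12 (ignore 1 high-order and 2 low-order
	 * color bits), plw_ceq_dif = 64 >> (1 + 2) = 8 equivalence classes
	 * and plw_ceq_mask[szc] = (8 - 1) << 2 = 0x1c, i.e. only color bits
	 * 2..4 are significant when comparing bins for equivalence.
	 */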
2781 2781
2782 2782 if (flags & PG_MATCH_COLOR) {
2783 2783 if (cpu_page_colors < 0) {
2784 2784 /*
2785 2785 * this is a heterogeneous machine with different CPUs
2786 2786 * having different size e$ (not supported for ni2/rock
2787 2787 			 * having different size e$ (not supported for ni2/rock)
2788 2788 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2789 2789 cpucolors = MAX(cpucolors, 1);
2790 2790 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2791 2791 plw->plw_ceq_mask[szc] =
2792 2792 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2793 2793 }
2794 2794 plw->plw_ceq_dif = 1;
2795 2795 }
2796 2796
2797 2797 /* we can split pages in the freelist, but not the cachelist */
2798 2798 if (can_split) {
2799 2799 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2800 2800
2801 2801 /* set next szc color masks and number of free list bins */
2802 2802 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2803 2803 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2804 2804 plw->plw_ceq_mask[szc]);
2805 2805 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2806 2806 }
2807 2807 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2808 2808 plw->plw_bins[nszc] = 0;
2809 2809
2810 2810 } else {
2811 2811 ASSERT(szc == 0);
2812 2812 plw->plw_do_split = 0;
2813 2813 plw->plw_bins[1] = 0;
2814 2814 plw->plw_ceq_mask[1] = INVALID_MASK;
2815 2815 }
2816 2816 }
2817 2817
2818 2818 /*
2819 2819 * set mark to flag where next split should occur
2820 2820 */
2821 2821 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2822 2822 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2823 2823 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2824 2824 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2825 2825 plw->plw_split_next = \
2826 2826 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2827 2827 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2828 2828 plw->plw_split_next = \
2829 2829 INC_MASKED(plw->plw_split_next, \
2830 2830 neq_mask, plw->plw_color_mask); \
2831 2831 } \
2832 2832 }
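/*
 * Descriptive note on the macro above: it records in plw_split_next the
 * next-size color at which page_list_walk_next_bin() should turn
 * plw_do_split back on. bin is mapped to its next-size color and stepped
 * forward by INC_MASKED(); if the result is color-equivalent to plw_bin0's
 * next-size color (the test against plw_ceq_mask[nszc]), it is stepped
 * forward once more so the marker lands in a different equivalence class
 * than the starting bin.
 */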
2833 2833
2834 2834 uint_t
2835 2835 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2836 2836 {
2837 2837 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2838 2838 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2839 2839 uchar_t nszc = szc + 1;
2840 2840
2841 2841 nbin = ADD_MASKED(bin,
2842 2842 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2843 2843
2844 2844 if (plw->plw_do_split) {
2845 2845 plw->plw_bin_split_prev = bin;
2846 2846 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2847 2847 plw->plw_do_split = 0;
2848 2848 }
2849 2849
2850 2850 if (szc == 0) {
2851 2851 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2852 2852 if (nbin == plw->plw_bin0 &&
2853 2853 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2854 2854 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2855 2855 neq_mask, plw->plw_color_mask);
2856 2856 plw->plw_bin_split_prev = plw->plw_bin0;
2857 2857 }
2858 2858
2859 2859 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2860 2860 plw->plw_bin_marker =
2861 2861 nbin = INC_MASKED(nbin, neq_mask,
2862 2862 plw->plw_color_mask);
2863 2863 plw->plw_bin_split_prev = plw->plw_bin0;
2864 2864 /*
2865 2865 * large pages all have the same vac color
2866 2866 * so by now we should be done with next
2867 2867 				 * so by now we should be done with the next
2868 2868 				 * size page splitting process.
2869 2869 ASSERT(plw->plw_bins[1] == 0);
2870 2870 plw->plw_do_split = 0;
2871 2871 return (nbin);
2872 2872 }
2873 2873
2874 2874 } else {
2875 2875 uint_t bin_jump = (vac_colors == 1) ?
2876 2876 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2877 2877
2878 2878 bin_jump &= ~(vac_colors - 1);
2879 2879
2880 2880 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2881 2881 plw->plw_color_mask);
2882 2882
2883 2883 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2884 2884
2885 2885 plw->plw_bin_marker = nbin = nbin0;
2886 2886
2887 2887 if (plw->plw_bins[nszc] != 0) {
2888 2888 /*
2889 2889 * check if next page size bin is the
2890 2890 * same as the next page size bin for
2891 2891 * bin0
2892 2892 */
2893 2893 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2894 2894 nbin);
2895 2895 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2896 2896 plw->plw_bin0);
2897 2897
2898 2898 if ((bin0_nsz ^ nbin_nsz) &
2899 2899 plw->plw_ceq_mask[nszc])
2900 2900 plw->plw_do_split = 1;
2901 2901 }
2902 2902 return (nbin);
2903 2903 }
2904 2904 }
2905 2905 }
2906 2906
2907 2907 if (plw->plw_bins[nszc] != 0) {
2908 2908 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2909 2909 if (!((plw->plw_split_next ^ nbin_nsz) &
2910 2910 plw->plw_ceq_mask[nszc]))
2911 2911 plw->plw_do_split = 1;
2912 2912 }
2913 2913
2914 2914 return (nbin);
2915 2915 }
2916 2916
2917 2917 page_t *
2918 2918 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2919 2919 uint_t flags)
2920 2920 {
2921 2921 kmutex_t *pcm;
2922 2922 page_t *pp, *first_pp;
2923 2923 uint_t sbin;
2924 2924 int plw_initialized;
2925 2925 page_list_walker_t plw;
2926 2926
2927 2927 ASSERT(szc < mmu_page_sizes);
2928 2928
2929 2929 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2930 2930
2931 2931 MTYPE_START(mnode, mtype, flags);
2932 2932 if (mtype < 0) { /* mnode does not have memory in mtype range */
2933 2933 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2934 2934 return (NULL);
2935 2935 }
2936 2936 try_again:
2937 2937
2938 2938 plw_initialized = 0;
2939 2939 plw.plw_ceq_dif = 1;
2940 2940
2941 2941 /*
2942 2942 * Only hold one freelist lock at a time, that way we
2943 2943 * can start anywhere and not have to worry about lock
2944 2944 * ordering.
2945 2945 */
2946 2946 for (plw.plw_count = 0;
2947 2947 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2948 2948 sbin = bin;
2949 2949 do {
2950 2950 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2951 2951 goto bin_empty_1;
2952 2952
2953 2953 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2954 2954 mutex_enter(pcm);
2955 2955 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2956 2956 if (pp == NULL)
2957 2957 goto bin_empty_0;
2958 2958
2959 2959 /*
2960 2960 * These were set before the page
2961 2961 * was put on the free list,
2962 2962 * they must still be set.
2963 2963 */
2964 2964 ASSERT(PP_ISFREE(pp));
2965 2965 ASSERT(PP_ISAGED(pp));
2966 2966 ASSERT(pp->p_vnode == NULL);
2967 2967 ASSERT(pp->p_hash == NULL);
2968 2968 ASSERT(pp->p_offset == (u_offset_t)-1);
2969 2969 ASSERT(pp->p_szc == szc);
2970 2970 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2971 2971
2972 2972 /*
2973 2973 			 * Walk down the freelist bin.
2974 2974 			 * PAGESIZE pages are linked on their
2975 2975 			 * p_next and p_prev fields. Large pages
2976 2976 			 * are a contiguous group of
2977 2977 			 * constituent pages linked together
2978 2978 			 * on their p_next and p_prev fields.
2979 2979 			 * The large pages are linked together
2980 2980 			 * on the freelist bin using the p_vpnext
2981 2981 			 * and p_vpprev fields of the base
2982 2982 			 * constituent page of each large page.
2983 2983 */
2984 2984 first_pp = pp;
2985 2985 while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
2986 2986 SE_EXCL)) {
2987 2987 if (szc == 0) {
2988 2988 pp = pp->p_next;
2989 2989 } else {
2990 2990 pp = pp->p_vpnext;
2991 2991 }
2992 2992
2993 2993 ASSERT(PP_ISFREE(pp));
2994 2994 ASSERT(PP_ISAGED(pp));
2995 2995 ASSERT(pp->p_vnode == NULL);
2996 2996 ASSERT(pp->p_hash == NULL);
2997 2997 ASSERT(pp->p_offset == (u_offset_t)-1);
2998 2998 ASSERT(pp->p_szc == szc);
2999 2999 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3000 3000
3001 3001 if (pp == first_pp)
3002 3002 goto bin_empty_0;
3003 3003 }
3004 3004
3005 3005 ASSERT(pp != NULL);
3006 3006 ASSERT(mtype == PP_2_MTYPE(pp));
3007 3007 ASSERT(pp->p_szc == szc);
3008 3008 if (szc == 0) {
3009 3009 page_sub(&PAGE_FREELISTS(mnode,
3010 3010 szc, bin, mtype), pp);
3011 3011 } else {
3012 3012 page_vpsub(&PAGE_FREELISTS(mnode,
3013 3013 szc, bin, mtype), pp);
3014 3014 CHK_LPG(pp, szc);
3015 3015 }
3016 3016 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3017 3017
3018 3018 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3019 3019 panic("free page is not. pp %p", (void *)pp);
3020 3020 mutex_exit(pcm);
3021 3021
3022 3022 #if defined(__sparc)
3023 3023 ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3024 3024 (flags & PG_NORELOC) == 0);
3025 3025
3026 3026 if (PP_ISNORELOC(pp))
3027 3027 kcage_freemem_sub(page_get_pagecnt(szc));
3028 3028 #endif
3029 3029 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3030 3030 return (pp);
3031 3031
3032 3032 bin_empty_0:
3033 3033 mutex_exit(pcm);
3034 3034 bin_empty_1:
3035 3035 if (plw_initialized == 0) {
3036 3036 page_list_walk_init(szc, flags, bin, 1, 1,
3037 3037 &plw);
3038 3038 plw_initialized = 1;
3039 3039 ASSERT(plw.plw_colors <=
3040 3040 PAGE_GET_PAGECOLORS(szc));
3041 3041 ASSERT(plw.plw_colors > 0);
3042 3042 ASSERT((plw.plw_colors &
3043 3043 (plw.plw_colors - 1)) == 0);
3044 3044 ASSERT(bin < plw.plw_colors);
3045 3045 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3046 3046 }
3047 3047 /* calculate the next bin with equivalent color */
3048 3048 bin = ADD_MASKED(bin, plw.plw_bin_step,
3049 3049 plw.plw_ceq_mask[szc], plw.plw_color_mask);
3050 3050 } while (sbin != bin);
3051 3051
3052 3052 /*
3053 3053 		 * All bins of equivalent color are empty at this point. Try to
3054 3054 * satisfy the request by breaking up or coalescing
3055 3055 * pages from a different size freelist of the correct
3056 3056 * color that satisfies the ORIGINAL color requested.
3057 3057 * If that fails then try pages of the same size but
3058 3058 * different colors assuming we are not called with
3059 3059 * PG_MATCH_COLOR.
3060 3060 */
3061 3061 if (plw.plw_do_split &&
3062 3062 (pp = page_freelist_split(szc, bin, mnode,
3063 3063 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3064 3064 return (pp);
3065 3065
3066 3066 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3067 3067 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
3068 3068 return (pp);
3069 3069
3070 3070 if (plw.plw_ceq_dif > 1)
3071 3071 bin = page_list_walk_next_bin(szc, bin, &plw);
3072 3072 }
3073 3073
3074 3074 /* if allowed, cycle through additional mtypes */
3075 3075 MTYPE_NEXT(mnode, mtype, flags);
3076 3076 if (mtype >= 0)
3077 3077 goto try_again;
3078 3078
3079 3079 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3080 3080
3081 3081 return (NULL);
3082 3082 }
3083 3083
3084 3084 /*
3085 3085 * Returns the count of free pages for 'pp' with size code 'szc'.
3086 3086 * Note: This function does not return an exact value as the page freelist
3087 3087 * locks are not held and thus the values in the page_counters may be
3088 3088 * changing as we walk through the data.
3089 3089 */
3090 3090 static int
3091 3091 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3092 3092 {
3093 3093 pgcnt_t pgfree;
3094 3094 pgcnt_t cnt;
3095 3095 ssize_t r = szc; /* region size */
3096 3096 ssize_t idx;
3097 3097 int i;
3098 3098 int full, range;
3099 3099
3100 3100 /* Make sure pagenum passed in is aligned properly */
3101 3101 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3102 3102 ASSERT(szc > 0);
3103 3103
3104 3104 /* Prevent page_counters dynamic memory from being freed */
3105 3105 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3106 3106 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3107 3107 cnt = PAGE_COUNTERS(mnode, r, idx);
3108 3108 pgfree = cnt << PNUM_SHIFT(r - 1);
3109 3109 range = FULL_REGION_CNT(szc);
3110 3110
3111 3111 /* Check for completely full region */
3112 3112 if (cnt == range) {
3113 3113 rw_exit(&page_ctrs_rwlock[mnode]);
3114 3114 return (pgfree);
3115 3115 }
3116 3116
3117 3117 while (--r > 0) {
3118 3118 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3119 3119 full = FULL_REGION_CNT(r);
3120 3120 for (i = 0; i < range; i++, idx++) {
3121 3121 cnt = PAGE_COUNTERS(mnode, r, idx);
3122 3122 /*
3123 3123 * If cnt here is full, that means we have already
3124 3124 * accounted for these pages earlier.
3125 3125 */
3126 3126 if (cnt != full) {
3127 3127 pgfree += (cnt << PNUM_SHIFT(r - 1));
3128 3128 }
3129 3129 }
3130 3130 range *= full;
3131 3131 }
3132 3132 rw_exit(&page_ctrs_rwlock[mnode]);
3133 3133 return (pgfree);
3134 3134 }
3135 3135
3136 3136 /*
3137 3137 * Called from page_geti_contig_pages to exclusively lock constituent pages
3138 3138 * starting from 'spp' for page size code 'szc'.
3139 3139 *
3140 3140 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3141 3141 * region needs to be greater than or equal to the threshold.
3142 3142 */
3143 3143 static int
3144 3144 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3145 3145 {
3146 3146 pgcnt_t pgcnt = PNUM_SIZE(szc);
3147 3147 pgcnt_t pgfree, i;
3148 3148 page_t *pp;
3149 3149
3150 3150 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3151 3151
3152 3152
3153 3153 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3154 3154 goto skipptcpcheck;
3155 3155 /*
3156 3156 * check if there are sufficient free pages available before attempting
3157 3157 * to trylock. Count is approximate as page counters can change.
3158 3158 */
3159 3159 pgfree = page_freecnt(mnode, spp, szc);
3160 3160
3161 3161 /* attempt to trylock if there are sufficient already free pages */
3162 3162 if (pgfree < pgcnt/ptcpthreshold) {
3163 3163 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3164 3164 return (0);
3165 3165 }
3166 3166
3167 3167 skipptcpcheck:
3168 3168
3169 3169 for (i = 0; i < pgcnt; i++) {
3170 3170 pp = &spp[i];
3171 3171 if (!page_trylock(pp, SE_EXCL)) {
3172 3172 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3173 3173 while (--i != (pgcnt_t)-1) {
3174 3174 pp = &spp[i];
3175 3175 ASSERT(PAGE_EXCL(pp));
3176 3176 page_unlock_nocapture(pp);
3177 3177 }
3178 3178 return (0);
3179 3179 }
3180 3180 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3181 3181 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3182 3182 !PP_ISFREE(pp)) {
3183 3183 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3184 3184 ASSERT(i == 0);
3185 3185 page_unlock_nocapture(pp);
3186 3186 return (0);
3187 3187 }
3188 3188
3189 3189 /*
3190 3190 * If a page has been marked non-relocatable or has been
3191 3191 * explicitly locked in memory, we don't want to relocate it;
3192 3192 * unlock the pages and fail the operation.
3193 3193 */
3194 3194 if (PP_ISNORELOC(pp) ||
3195 3195 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3196 3196 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3197 3197 while (i != (pgcnt_t)-1) {
3198 3198 pp = &spp[i];
3199 3199 ASSERT(PAGE_EXCL(pp));
3200 3200 page_unlock_nocapture(pp);
3201 3201 i--;
3202 3202 }
3203 3203 return (0);
3204 3204 }
3205 3205 }
3206 3206 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3207 3207 return (1);
3208 3208 }
3209 3209
3210 3210 /*
3211 3211 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3212 3212 * of 'szc' constituent pages that had been locked exclusively previously.
3213 3213 * Will attempt to relocate constituent pages in use.
3214 3214 */
3215 3215 static page_t *
3216 3216 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3217 3217 {
3218 3218 spgcnt_t pgcnt, npgs, i;
3219 3219 page_t *targpp, *rpp, *hpp;
3220 3220 page_t *replpp = NULL;
3221 3221 page_t *pplist = NULL;
3222 3222
3223 3223 ASSERT(pp != NULL);
3224 3224
3225 3225 pgcnt = page_get_pagecnt(szc);
3226 3226 while (pgcnt) {
3227 3227 ASSERT(PAGE_EXCL(pp));
3228 3228 ASSERT(!PP_ISNORELOC(pp));
3229 3229 if (PP_ISFREE(pp)) {
3230 3230 /*
3231 3231 * If this is a PG_FREE_LIST page then its
3232 3232 * size code can change underneath us due to
3233 3233 			 * page promotion or demotion. As an optimization
3234 3234 * use page_list_sub_pages() instead of
3235 3235 * page_list_sub().
3236 3236 */
3237 3237 if (PP_ISAGED(pp)) {
3238 3238 page_list_sub_pages(pp, szc);
3239 3239 if (pp->p_szc == szc) {
3240 3240 return (pp);
3241 3241 }
3242 3242 ASSERT(pp->p_szc < szc);
3243 3243 npgs = page_get_pagecnt(pp->p_szc);
3244 3244 hpp = pp;
3245 3245 for (i = 0; i < npgs; i++, pp++) {
3246 3246 pp->p_szc = szc;
3247 3247 }
3248 3248 page_list_concat(&pplist, &hpp);
3249 3249 pgcnt -= npgs;
3250 3250 continue;
3251 3251 }
3252 3252 ASSERT(!PP_ISAGED(pp));
3253 3253 ASSERT(pp->p_szc == 0);
3254 3254 page_list_sub(pp, PG_CACHE_LIST);
3255 3255 page_hashout(pp, NULL);
3256 3256 PP_SETAGED(pp);
3257 3257 pp->p_szc = szc;
3258 3258 page_list_concat(&pplist, &pp);
3259 3259 pp++;
3260 3260 pgcnt--;
3261 3261 continue;
3262 3262 }
3263 3263 npgs = page_get_pagecnt(pp->p_szc);
3264 3264
3265 3265 /*
3266 3266 		 * The page_create_wait() freemem accounting is done by the
3267 3267 		 * caller of page_get_freelist(), so there is no need to call
3268 3268 		 * it again before calling page_get_replacement_page.
3269 3269 *
3270 3270 * page_get_replacement_page can call page_get_contig_pages
3271 3271 * to acquire a large page (szc > 0); the replacement must be
3272 3272 * smaller than the contig page size to avoid looping or
3273 3273 * szc == 0 and PGI_PGCPSZC0 is set.
3274 3274 */
3275 3275 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3276 3276 replpp = page_get_replacement_page(pp, NULL, 0);
3277 3277 if (replpp) {
3278 3278 npgs = page_get_pagecnt(pp->p_szc);
3279 3279 ASSERT(npgs <= pgcnt);
3280 3280 targpp = pp;
3281 3281 }
3282 3282 }
3283 3283
3284 3284 /*
3285 3285 * If replacement is NULL or do_page_relocate fails, fail
3286 3286 * coalescing of pages.
3287 3287 */
3288 3288 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3289 3289 &npgs, NULL) != 0)) {
3290 3290 /*
3291 3291 * Unlock un-processed target list
3292 3292 */
3293 3293 while (pgcnt--) {
3294 3294 ASSERT(PAGE_EXCL(pp));
3295 3295 page_unlock_nocapture(pp);
3296 3296 pp++;
3297 3297 }
3298 3298 /*
3299 3299 * Free the processed target list.
3300 3300 */
3301 3301 while (pplist) {
3302 3302 pp = pplist;
3303 3303 page_sub(&pplist, pp);
3304 3304 ASSERT(PAGE_EXCL(pp));
3305 3305 ASSERT(pp->p_szc == szc);
3306 3306 ASSERT(PP_ISFREE(pp));
3307 3307 ASSERT(PP_ISAGED(pp));
3308 3308 pp->p_szc = 0;
3309 3309 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3310 3310 page_unlock_nocapture(pp);
3311 3311 }
3312 3312
3313 3313 if (replpp != NULL)
3314 3314 page_free_replacement_page(replpp);
3315 3315
3316 3316 return (NULL);
3317 3317 }
3318 3318 ASSERT(pp == targpp);
3319 3319
3320 3320 /* LINTED */
3321 3321 ASSERT(hpp = pp); /* That's right, it's an assignment */
3322 3322
3323 3323 pp += npgs;
3324 3324 pgcnt -= npgs;
3325 3325
3326 3326 while (npgs--) {
3327 3327 ASSERT(PAGE_EXCL(targpp));
3328 3328 ASSERT(!PP_ISFREE(targpp));
3329 3329 ASSERT(!PP_ISNORELOC(targpp));
3330 3330 PP_SETFREE(targpp);
3331 3331 ASSERT(PP_ISAGED(targpp));
3332 3332 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3333 3333 (flags & PGI_PGCPSZC0)));
3334 3334 targpp->p_szc = szc;
3335 3335 targpp = targpp->p_next;
3336 3336
3337 3337 rpp = replpp;
3338 3338 ASSERT(rpp != NULL);
3339 3339 page_sub(&replpp, rpp);
3340 3340 ASSERT(PAGE_EXCL(rpp));
3341 3341 ASSERT(!PP_ISFREE(rpp));
3342 3342 page_unlock_nocapture(rpp);
3343 3343 }
3344 3344 ASSERT(targpp == hpp);
3345 3345 ASSERT(replpp == NULL);
3346 3346 page_list_concat(&pplist, &targpp);
3347 3347 }
3348 3348 CHK_LPG(pplist, szc);
3349 3349 return (pplist);
3350 3350 }
3351 3351
3352 3352 /*
3353 3353 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3354 3354 * of 0 means nothing left after trim.
3355 3355 */
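/*
 * Descriptive note: three cases are handled below, assuming the cage
 * boundary (kcage_current_pfn()) still lies within the memseg where that
 * matters. If only the low end of the memseg is caged, the usable range is
 * [MAX(kcagepfn, pfnlo), MIN(pfnhi, pages_end - 1)]; if only the high end
 * is caged, it is [MAX(pfnlo, pages_base), MIN(kcagepfn, pfnhi)]; if the
 * memseg lies entirely outside the cage, only the memseg bounds are
 * applied. A memseg entirely inside the cage (or one the cage boundary has
 * already moved past) yields 0.
 */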
3356 3356 int
3357 3357 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3358 3358 {
3359 3359 pfn_t kcagepfn;
3360 3360 int decr;
3361 3361 int rc = 0;
3362 3362
3363 3363 if (PP_ISNORELOC(mseg->pages)) {
3364 3364 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3365 3365
3366 3366 /* lower part of this mseg inside kernel cage */
3367 3367 decr = kcage_current_pfn(&kcagepfn);
3368 3368
3369 3369 /* kernel cage may have transitioned past mseg */
3370 3370 if (kcagepfn >= mseg->pages_base &&
3371 3371 kcagepfn < mseg->pages_end) {
3372 3372 ASSERT(decr == 0);
3373 3373 *lo = MAX(kcagepfn, pfnlo);
3374 3374 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3375 3375 rc = 1;
3376 3376 }
3377 3377 }
3378 3378 /* else entire mseg in the cage */
3379 3379 } else {
3380 3380 if (PP_ISNORELOC(mseg->epages - 1)) {
3381 3381
3382 3382 /* upper part of this mseg inside kernel cage */
3383 3383 decr = kcage_current_pfn(&kcagepfn);
3384 3384
3385 3385 /* kernel cage may have transitioned past mseg */
3386 3386 if (kcagepfn >= mseg->pages_base &&
3387 3387 kcagepfn < mseg->pages_end) {
3388 3388 ASSERT(decr);
3389 3389 *hi = MIN(kcagepfn, pfnhi);
3390 3390 *lo = MAX(pfnlo, mseg->pages_base);
3391 3391 rc = 1;
3392 3392 }
3393 3393 } else {
3394 3394 /* entire mseg outside of kernel cage */
3395 3395 *lo = MAX(pfnlo, mseg->pages_base);
3396 3396 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3397 3397 rc = 1;
3398 3398 }
3399 3399 }
3400 3400 return (rc);
3401 3401 }
3402 3402
3403 3403 /*
3404 3404 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3405 3405 * page with size code 'szc'. Claiming such a page requires acquiring
3406 3406 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3407 3407 * relocating pages in use and concatenating these constituent pages into a
3408 3408 * large page.
3409 3409 *
3410 3410 * The page lists do not have such a large page and page_freelist_split has
3411 3411 * already failed to demote larger pages and/or coalesce smaller free pages.
3412 3412 *
3413 3413  * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
3414 3414  * pages with the same color as 'bin'.
3415 3415 *
3416 3416 * 'pfnflag' specifies the subset of the pfn range to search.
3417 3417 */
3418 3418
3419 3419 static page_t *
3420 3420 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3421 3421 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3422 3422 {
3423 3423 struct memseg *mseg;
3424 3424 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3425 3425 pgcnt_t szcpgmask = szcpgcnt - 1;
3426 3426 pfn_t randpfn;
3427 3427 page_t *pp, *randpp, *endpp;
3428 3428 uint_t colors, ceq_mask;
3429 3429 /* LINTED : set but not used in function */
3430 3430 uint_t color_mask __unused;
3431 3431 pfn_t hi, lo;
3432 3432 uint_t skip;
3433 3433 MEM_NODE_ITERATOR_DECL(it);
3434 3434
3435 3435 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3436 3436
3437 3437 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3438 3438
3439 3439 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3440 3440 return (NULL);
3441 3441
3442 3442 ASSERT(szc < mmu_page_sizes);
3443 3443
3444 3444 colors = PAGE_GET_PAGECOLORS(szc);
3445 3445 color_mask = colors - 1;
3446 3446 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3447 3447 uchar_t ceq = colorequivszc[szc];
3448 3448 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3449 3449
3450 3450 ASSERT(ceq_dif > 0);
3451 3451 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3452 3452 } else {
3453 3453 ceq_mask = 0;
3454 3454 }
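	/*
	 * Illustrative example (hypothetical values, a sketch of the
	 * computation above): with colors = 32 and colorequivszc[szc] = 0x11
	 * (ignore one high and one low color bit), ceq_dif = 32 >> 2 = 8 and
	 * ceq_mask = 7 << 1 = 0xe, so only color bits 1-3 are treated as
	 * significant below.
	 */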
3455 3455
3456 3456 ASSERT(bin < colors);
3457 3457
3458 3458 /* clear "non-significant" color bits */
3459 3459 bin &= ceq_mask;
3460 3460
3461 3461 /*
3462 3462 * trim the pfn range to search based on pfnflag. pfnflag is set
3463 3463	 * when there have been previous page_get_contig_pages failures to
3464 3464	 * limit the search.
3465 3465	 *
3466 3466	 * The high bit in pfnflag specifies the number of 'slots' in the
3467 3467	 * pfn range and the remainder of pfnflag specifies which slot.
3468 3468	 * For example, a value of 1010b selects slot 2 of a pfn range
3469 3469	 * that has been divided into 8 slots (slot indices 0-7).
3470 3470 */
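	/*
	 * Illustrative walk-through (hypothetical value): pfnflag = 0xa
	 * (1010b) yields slots = 1 << (highbit(0xa) - 1) = 8 and
	 * slotid = 0xa & 7 = 2, so only the third of the eight szc-aligned
	 * slices of the pfn range is searched on this attempt.
	 */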
3471 3471 if (pfnflag > 1) {
3472 3472 int slots = 1 << (highbit(pfnflag) - 1);
3473 3473 int slotid = pfnflag & (slots - 1);
3474 3474 pgcnt_t szcpages;
3475 3475 int slotlen;
3476 3476
3477 3477 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3478 3478 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3479 3479 slotlen = howmany(szcpages, slots);
3480 3480 /* skip if 'slotid' slot is empty */
3481 3481 if (slotid * slotlen >= szcpages)
3482 3482 return (NULL);
3483 3483 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3484 3484 ASSERT(pfnlo < pfnhi);
3485 3485 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3486 3486 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3487 3487 }
3488 3488
3489 3489 /*
3490 3490	 * This routine can be called recursively so we shouldn't
3491 3491 * acquire a reader lock if a write request is pending. This
3492 3492 * could lead to a deadlock with the DR thread.
3493 3493 *
3494 3494 * Returning NULL informs the caller that we could not get
3495 3495 * a contig page with the required characteristics.
3496 3496 */
3497 3497
3498 3498 if (!memsegs_trylock(0))
3499 3499 return (NULL);
3500 3500
3501 3501 /*
3502 3502 * loop through memsegs to look for contig page candidates
3503 3503 */
3504 3504
3505 3505 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3506 3506 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3507 3507 /* no overlap */
3508 3508 continue;
3509 3509 }
3510 3510
3511 3511 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3512 3512 /* mseg too small */
3513 3513 continue;
3514 3514
3515 3515 /*
3516 3516		 * trim kernel cage pages from the pfn range and skip this
3517 3517		 * mseg if the trimmed range returned cannot span the
3518 3518		 * desired large page size.
3519 3519 */
3520 3520 if (kcage_on) {
3521 3521 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3522 3522 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3523 3523 continue;
3524 3524 } else {
3525 3525 lo = MAX(pfnlo, mseg->pages_base);
3526 3526 hi = MIN(pfnhi, (mseg->pages_end - 1));
3527 3527 }
3528 3528
3529 3529 /* round to szcpgcnt boundaries */
3530 3530 lo = P2ROUNDUP(lo, szcpgcnt);
3531 3531
3532 3532 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3533 3533 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3534 3534
3535 3535 if (hi <= lo)
3536 3536 continue;
3537 3537
3538 3538 /*
3539 3539 * set lo to point to the pfn for the desired bin. Large
3540 3540 * page sizes may only have a single page color
3541 3541 */
3542 3542 skip = szcpgcnt;
3543 3543 if (ceq_mask > 0 || interleaved_mnodes) {
3544 3544 /* set lo to point at appropriate color */
3545 3545 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3546 3546 (interleaved_mnodes &&
3547 3547 PFN_2_MEM_NODE(lo) != mnode)) {
3548 3548 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3549 3549 color_mask, &it);
3550 3550 }
3551 3551 if (hi <= lo)
3552 3552 /* mseg cannot satisfy color request */
3553 3553 continue;
3554 3554 }
3555 3555
3556 3556 /* randomly choose a point between lo and hi to begin search */
3557 3557
3558 3558 randpfn = (pfn_t)GETTICK();
3559 3559 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3560 3560 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3561 3561 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3562 3562 if (randpfn != (pfn_t)-1) {
3563 3563 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3564 3564 ceq_mask, color_mask, &it);
3565 3565 }
3566 3566 if (randpfn >= hi) {
3567 3567 randpfn = lo;
3568 3568 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3569 3569 &it);
3570 3570 }
3571 3571 }
3572 3572 randpp = mseg->pages + (randpfn - mseg->pages_base);
3573 3573
3574 3574 ASSERT(randpp->p_pagenum == randpfn);
3575 3575
3576 3576 pp = randpp;
3577 3577 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3578 3578
3579 3579 ASSERT(randpp + szcpgcnt <= endpp);
3580 3580
3581 3581 do {
3582 3582 ASSERT(!(pp->p_pagenum & szcpgmask));
3583 3583 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3584 3584
3585 3585 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3586 3586 /* pages unlocked by page_claim on failure */
3587 3587 if (page_claim_contig_pages(pp, szc, flags)) {
3588 3588 memsegs_unlock(0);
3589 3589 return (pp);
3590 3590 }
3591 3591 }
3592 3592
3593 3593 if (ceq_mask == 0 && !interleaved_mnodes) {
3594 3594 pp += skip;
3595 3595 } else {
3596 3596 pfn_t pfn = pp->p_pagenum;
3597 3597
3598 3598 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3599 3599 ceq_mask, color_mask, &it);
3600 3600 if (pfn == (pfn_t)-1) {
3601 3601 pp = endpp;
3602 3602 } else {
3603 3603 pp = mseg->pages +
3604 3604 (pfn - mseg->pages_base);
3605 3605 }
3606 3606 }
3607 3607 if (pp >= endpp) {
3608 3608 /* start from the beginning */
3609 3609 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3610 3610 pp = mseg->pages + (lo - mseg->pages_base);
3611 3611 ASSERT(pp->p_pagenum == lo);
3612 3612 ASSERT(pp + szcpgcnt <= endpp);
3613 3613 }
3614 3614 } while (pp != randpp);
3615 3615 }
3616 3616 memsegs_unlock(0);
3617 3617 return (NULL);
3618 3618 }
3619 3619
3620 3620
3621 3621 /*
3622 3622 * controlling routine that searches through physical memory in an attempt to
3623 3623  * claim a large page based on the input parameters when such a page could
3624 3624  * not be found on the page free lists.
3625 3625 *
3626 3626 * calls page_geti_contig_pages with an initial pfn range from the mnode
3627 3627 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3628 3628 * that overlaps with the kernel cage or does not match the requested page
3629 3629 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3630 3630 * page_geti_contig_pages may further limit the search range based on
3631 3631 * previous failure counts (pgcpfailcnt[]).
3632 3632 *
3633 3633 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3634 3634 * pagesize page that satisfies mtype.
3635 3635 */
3636 3636 page_t *
3637 3637 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3638 3638 uint_t flags)
3639 3639 {
3640 3640 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3641 3641 page_t *pp;
3642 3642 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3643 3643
3644 3644 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3645 3645
3646 3646 /* no allocations from cage */
3647 3647 flags |= PGI_NOCAGE;
3648 3648
3649 3649 /* LINTED */
3650 3650 MTYPE_START(mnode, mtype, flags);
3651 3651 if (mtype < 0) { /* mnode does not have memory in mtype range */
3652 3652 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3653 3653 return (NULL);
3654 3654 }
3655 3655
3656 3656 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3657 3657
3658 3658	/* for hi pri requests, the search is not limited and color is ignored */
3659 3659
3660 3660 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3661 3661 pfnflag = pgcpfailcnt[szc];
3662 3662
3663 3663 /* remove color match to improve chances */
3664 3664
3665 3665 if (flags & PGI_PGCPHIPRI || pfnflag)
3666 3666 flags &= ~PG_MATCH_COLOR;
3667 3667
3668 3668 do {
3669 3669 /* get pfn range based on mnode and mtype */
3670 3670 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3671 3671
3672 3672 ASSERT(pfnhi >= pfnlo);
3673 3673
3674 3674 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3675 3675 pfnlo, pfnhi, pfnflag);
3676 3676
3677 3677 if (pp != NULL) {
3678 3678 pfnflag = pgcpfailcnt[szc];
3679 3679 if (pfnflag) {
3680 3680				/* halve the fail count: double the search size next time */
3681 3681 pgcpfailcnt[szc] = pfnflag >> 1;
3682 3682 }
3683 3683 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3684 3684 return (pp);
3685 3685 }
3686 3686 MTYPE_NEXT(mnode, mtype, flags);
3687 3687 } while (mtype >= 0);
3688 3688
3689 3689 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3690 3690 return (NULL);
3691 3691 }
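/*
 * Illustrative note (assumed values): pgcpfailcnt[szc] feeds back into
 * pfnflag above.  If repeated failures have pushed pgcpfailcnt[szc] to,
 * say, 8, only 1/8 of the pfn range is searched per attempt; a success
 * then halves it to 4, doubling the slice of the range searched next time.
 */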
3692 3692
3693 3693 #if defined(__i386) || defined(__amd64)
3694 3694 /*
3695 3695 * Determine the likelihood of finding/coalescing a szc page.
3696 3696  * Return 0 if the likelihood is small; otherwise return 1.
3697 3697  *
3698 3698  * For now, be conservative and check only 1g pages; return 0
3699 3699  * if there have been previous coalescing failures and the szc pages
3700 3700  * needed to satisfy the request would exhaust most of freemem.
3701 3701 */
3702 3702 int
3703 3703 page_chk_freelist(uint_t szc)
3704 3704 {
3705 3705 pgcnt_t pgcnt;
3706 3706
3707 3707 if (szc <= 1)
3708 3708 return (1);
3709 3709
3710 3710 pgcnt = page_get_pagecnt(szc);
3711 3711 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3712 3712 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3713 3713 return (0);
3714 3714 }
3715 3715 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3716 3716 return (1);
3717 3717 }
3718 3718 #endif
3719 3719
3720 3720 /*
3721 3721 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3722 3722 *
3723 3723 * Does its own locking and accounting.
3724 3724 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3725 3725 * pages of the proper color even if there are pages of a different color.
3726 3726 *
3727 3727 * Finds a page, removes it, THEN locks it.
3728 3728 */
3729 3729
3730 3730 /*ARGSUSED*/
3731 3731 page_t *
3732 3732 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3733 3733 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3734 3734 {
3735 3735 struct as *as = seg->s_as;
3736 3736 page_t *pp = NULL;
3737 3737 ulong_t bin;
3738 3738 uchar_t szc;
3739 3739 int mnode;
3740 3740 int mtype;
3741 3741 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3742 3742 lgrp_mnode_cookie_t lgrp_cookie;
3743 3743
3744 3744 page_get_func = page_get_mnode_freelist;
3745 3745
3746 3746 /*
3747 3747 * If we aren't passed a specific lgroup, or passed a freed lgrp
3748 3748 * assume we wish to allocate near to the current thread's home.
3749 3749 */
3750 3750 if (!LGRP_EXISTS(lgrp))
3751 3751 lgrp = lgrp_home_lgrp();
3752 3752
3753 3753 if (kcage_on) {
3754 3754 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3755 3755 kcage_freemem < kcage_throttlefree + btop(size) &&
3756 3756 curthread != kcage_cageout_thread) {
3757 3757 /*
3758 3758 * Set a "reserve" of kcage_throttlefree pages for
3759 3759 * PG_PANIC and cageout thread allocations.
3760 3760 *
3761 3761 * Everybody else has to serialize in
3762 3762 * page_create_get_something() to get a cage page, so
3763 3763 * that we don't deadlock cageout!
3764 3764 */
3765 3765 return (NULL);
3766 3766 }
3767 3767 } else {
3768 3768 flags &= ~PG_NORELOC;
3769 3769 flags |= PGI_NOCAGE;
3770 3770 }
3771 3771
3772 3772 /* LINTED */
3773 3773 MTYPE_INIT(mtype, vp, vaddr, flags, size);
3774 3774
3775 3775 /*
3776 3776 * Convert size to page size code.
3777 3777 */
3778 3778 if ((szc = page_szc(size)) == (uchar_t)-1)
3779 3779 panic("page_get_freelist: illegal page size request");
3780 3780 ASSERT(szc < mmu_page_sizes);
3781 3781
3782 3782 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3783 3783
3784 3784 /* LINTED */
3785 3785 AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3786 3786
3787 3787 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3788 3788
3789 3789 /*
3790 3790 * Try to get a local page first, but try remote if we can't
3791 3791 * get a page of the right color.
3792 3792 */
3793 3793 pgretry:
3794 3794 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3795 3795 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3796 3796 pp = page_get_func(mnode, bin, mtype, szc, flags);
3797 3797 if (pp != NULL) {
3798 3798 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3799 3799 DTRACE_PROBE4(page__get,
3800 3800 lgrp_t *, lgrp,
3801 3801 int, mnode,
3802 3802 ulong_t, bin,
3803 3803 uint_t, flags);
3804 3804 return (pp);
3805 3805 }
3806 3806 }
3807 3807 ASSERT(pp == NULL);
3808 3808
3809 3809 /*
3810 3810	 * for PAGESIZE requests without PGI_PGCPSZC0 set, check the cachelist
3811 3811	 * before remote free lists.  The caller is expected to call
3812 3812	 * page_get_cachelist, which checks local cachelists and remote free lists.
3813 3813 */
3814 3814 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3815 3815 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3816 3816 return (NULL);
3817 3817 }
3818 3818
3819 3819 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3820 3820
3821 3821 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3822 3822
3823 3823 if (!(flags & PG_LOCAL)) {
3824 3824 /*
3825 3825 * Try to get a non-local freelist page.
3826 3826 */
3827 3827 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3828 3828 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3829 3829 pp = page_get_func(mnode, bin, mtype, szc, flags);
3830 3830 if (pp != NULL) {
3831 3831 DTRACE_PROBE4(page__get,
3832 3832 lgrp_t *, lgrp,
3833 3833 int, mnode,
3834 3834 ulong_t, bin,
3835 3835 uint_t, flags);
3836 3836 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3837 3837 return (pp);
3838 3838 }
3839 3839 }
3840 3840 ASSERT(pp == NULL);
3841 3841 }
3842 3842
3843 3843 /*
3844 3844	 * When the cage is off, chances are page_get_contig_pages() will fail
3845 3845	 * to lock a large page chunk; therefore it is not called by default
3846 3846	 * in that case.  This can be changed via /etc/system.
3847 3847	 *
3848 3848	 * page_get_contig_pages() is also called to acquire a base pagesize page
3849 3849 * for page_create_get_something().
3850 3850 */
3851 3851 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3852 3852 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3853 3853 (page_get_func != page_get_contig_pages)) {
3854 3854
3855 3855 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3856 3856 page_get_func = page_get_contig_pages;
3857 3857 goto pgretry;
3858 3858 }
3859 3859
3860 3860 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3861 3861 page_get_func == page_get_contig_pages)
3862 3862 SETPGCPFAILCNT(szc);
3863 3863
3864 3864 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3865 3865 return (NULL);
3866 3866 }
3867 3867
3868 3868 /*
3869 3869 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3870 3870 *
3871 3871 * Does its own locking.
3872 3872 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3873 3873 * pages of the proper color even if there are pages of a different color.
3874 3874 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3875 3875 * try to lock one of them. If no page can be locked, try the
3876 3876  * next bin. Return NULL if a page cannot be found and locked.
3877 3877  *
3878 3878  * Finds a page, tries to lock it, then removes it.
3879 3879 */
3880 3880
3881 3881 /*ARGSUSED*/
3882 3882 page_t *
3883 3883 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3884 3884 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3885 3885 {
3886 3886 page_t *pp;
3887 3887 struct as *as = seg->s_as;
3888 3888 ulong_t bin;
3889 3889 /*LINTED*/
3890 3890 int mnode;
3891 3891 int mtype;
3892 3892 lgrp_mnode_cookie_t lgrp_cookie;
3893 3893
3894 3894 /*
3895 3895	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3896 3896 * assume we wish to allocate near to the current thread's home.
3897 3897 */
3898 3898 if (!LGRP_EXISTS(lgrp))
3899 3899 lgrp = lgrp_home_lgrp();
3900 3900
3901 3901 if (!kcage_on) {
3902 3902 flags &= ~PG_NORELOC;
3903 3903 flags |= PGI_NOCAGE;
3904 3904 }
3905 3905
3906 3906 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3907 3907 kcage_freemem <= kcage_throttlefree) {
3908 3908 /*
3909 3909 * Reserve kcage_throttlefree pages for critical kernel
3910 3910 * threads.
3911 3911 *
3912 3912 * Everybody else has to go to page_create_get_something()
3913 3913 * to get a cage page, so we don't deadlock cageout.
3914 3914 */
3915 3915 return (NULL);
3916 3916 }
3917 3917
3918 3918 /* LINTED */
3919 3919 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3920 3920
3921 3921 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3922 3922
3923 3923 /* LINTED */
3924 3924 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3925 3925
3926 3926 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3927 3927
3928 3928 /*
3929 3929 * Try local cachelists first
3930 3930 */
3931 3931 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3932 3932 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3933 3933 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3934 3934 if (pp != NULL) {
3935 3935 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3936 3936 DTRACE_PROBE4(page__get,
3937 3937 lgrp_t *, lgrp,
3938 3938 int, mnode,
3939 3939 ulong_t, bin,
3940 3940 uint_t, flags);
3941 3941 return (pp);
3942 3942 }
3943 3943 }
3944 3944
3945 3945 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3946 3946
3947 3947 /*
3948 3948 * Try freelists/cachelists that are farther away
3949 3949 * This is our only chance to allocate remote pages for PAGESIZE
3950 3950 * requests.
3951 3951 */
3952 3952 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3953 3953 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3954 3954 pp = page_get_mnode_freelist(mnode, bin, mtype,
3955 3955 0, flags);
3956 3956 if (pp != NULL) {
3957 3957 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3958 3958 DTRACE_PROBE4(page__get,
3959 3959 lgrp_t *, lgrp,
3960 3960 int, mnode,
3961 3961 ulong_t, bin,
3962 3962 uint_t, flags);
3963 3963 return (pp);
3964 3964 }
3965 3965 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3966 3966 if (pp != NULL) {
3967 3967 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3968 3968 DTRACE_PROBE4(page__get,
3969 3969 lgrp_t *, lgrp,
3970 3970 int, mnode,
3971 3971 ulong_t, bin,
3972 3972 uint_t, flags);
3973 3973 return (pp);
3974 3974 }
3975 3975 }
3976 3976
3977 3977 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3978 3978 return (NULL);
3979 3979 }
3980 3980
3981 3981 page_t *
3982 3982 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3983 3983 {
3984 3984 kmutex_t *pcm;
3985 3985 page_t *pp, *first_pp;
3986 3986 uint_t sbin;
3987 3987 int plw_initialized;
3988 3988 page_list_walker_t plw;
3989 3989
3990 3990 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3991 3991
3992 3992 /* LINTED */
3993 3993 MTYPE_START(mnode, mtype, flags);
3994 3994 if (mtype < 0) { /* mnode does not have memory in mtype range */
3995 3995 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3996 3996 return (NULL);
3997 3997 }
3998 3998
3999 3999 try_again:
4000 4000
4001 4001 plw_initialized = 0;
4002 4002 plw.plw_ceq_dif = 1;
4003 4003
4004 4004 /*
4005 4005 * Only hold one cachelist lock at a time, that way we
4006 4006 * can start anywhere and not have to worry about lock
4007 4007 * ordering.
4008 4008 */
4009 4009
4010 4010 for (plw.plw_count = 0;
4011 4011 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4012 4012 sbin = bin;
4013 4013 do {
4014 4014
4015 4015 if (!PAGE_CACHELISTS(mnode, bin, mtype))
4016 4016 goto bin_empty_1;
4017 4017 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4018 4018 mutex_enter(pcm);
4019 4019 pp = PAGE_CACHELISTS(mnode, bin, mtype);
4020 4020 if (pp == NULL)
4021 4021 goto bin_empty_0;
4022 4022
4023 4023 first_pp = pp;
4024 4024 ASSERT(pp->p_vnode);
4025 4025 ASSERT(PP_ISAGED(pp) == 0);
4026 4026 ASSERT(pp->p_szc == 0);
4027 4027 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4028 4028 while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
4029 4029 pp = pp->p_next;
4030 4030 ASSERT(pp->p_szc == 0);
4031 4031 if (pp == first_pp) {
4032 4032 /*
4033 4033 * We have searched the complete list!
4034 4034 * And all of them (might only be one)
4035 4035 * are locked. This can happen since
4036 4036 * these pages can also be found via
4037 4037 * the hash list. When found via the
4038 4038 * hash list, they are locked first,
4039 4039 * then removed. We give up to let the
4040 4040 * other thread run.
4041 4041 */
4042 4042 pp = NULL;
4043 4043 break;
4044 4044 }
4045 4045 ASSERT(pp->p_vnode);
4046 4046 ASSERT(PP_ISFREE(pp));
4047 4047 ASSERT(PP_ISAGED(pp) == 0);
4048 4048 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4049 4049 mnode);
4050 4050 }
4051 4051
4052 4052 if (pp) {
4053 4053 page_t **ppp;
4054 4054 /*
4055 4055 * Found and locked a page.
4056 4056 * Pull it off the list.
4057 4057 */
4058 4058 ASSERT(mtype == PP_2_MTYPE(pp));
4059 4059 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4060 4060 page_sub(ppp, pp);
4061 4061 /*
4062 4062 * Subtract counters before releasing pcm mutex
4063 4063 * to avoid a race with page_freelist_coalesce
4064 4064 * and page_freelist_split.
4065 4065 */
4066 4066 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4067 4067 mutex_exit(pcm);
4068 4068 ASSERT(pp->p_vnode);
4069 4069 ASSERT(PP_ISAGED(pp) == 0);
4070 4070 #if defined(__sparc)
4071 4071 ASSERT(!kcage_on ||
4072 4072 (flags & PG_NORELOC) == 0 ||
4073 4073 PP_ISNORELOC(pp));
4074 4074 if (PP_ISNORELOC(pp)) {
4075 4075 kcage_freemem_sub(1);
4076 4076 }
4077 4077 #endif
4078 4078				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4079 4079 return (pp);
4080 4080 }
4081 4081 bin_empty_0:
4082 4082 mutex_exit(pcm);
4083 4083 bin_empty_1:
4084 4084 if (plw_initialized == 0) {
4085 4085 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4086 4086 plw_initialized = 1;
4087 4087 }
4088 4088 /* calculate the next bin with equivalent color */
4089 4089 bin = ADD_MASKED(bin, plw.plw_bin_step,
4090 4090 plw.plw_ceq_mask[0], plw.plw_color_mask);
4091 4091 } while (sbin != bin);
4092 4092
4093 4093 if (plw.plw_ceq_dif > 1)
4094 4094 bin = page_list_walk_next_bin(0, bin, &plw);
4095 4095 }
4096 4096
4097 4097 MTYPE_NEXT(mnode, mtype, flags);
4098 4098 if (mtype >= 0)
4099 4099 goto try_again;
4100 4100
4101 4101 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4102 4102 return (NULL);
4103 4103 }
4104 4104
4105 4105 #ifdef DEBUG
4106 4106 #define REPL_PAGE_STATS
4107 4107 #endif /* DEBUG */
4108 4108
4109 4109 #ifdef REPL_PAGE_STATS
4110 4110 struct repl_page_stats {
4111 4111 uint_t ngets;
4112 4112 uint_t ngets_noreloc;
4113 4113 uint_t npgr_noreloc;
4114 4114 uint_t nnopage_first;
4115 4115 uint_t nnopage;
4116 4116 uint_t nhashout;
4117 4117 uint_t nnofree;
4118 4118 uint_t nnext_pp;
4119 4119 } repl_page_stats;
4120 4120 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
4121 4121 #else /* REPL_PAGE_STATS */
4122 4122 #define REPL_STAT_INCR(v)
4123 4123 #endif /* REPL_PAGE_STATS */
4124 4124
4125 4125 int pgrppgcp;
4126 4126
4127 4127 /*
4128 4128 * The freemem accounting must be done by the caller.
4129 4129  * First we try to get a replacement page of the same size as like_pp;
4130 4130  * if that is not possible, we just get a set of discontiguous
4131 4131  * PAGESIZE pages.
4132 4132 */
4133 4133 page_t *
4134 4134 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4135 4135 uint_t pgrflags)
4136 4136 {
4137 4137 page_t *like_pp;
4138 4138 page_t *pp, *pplist;
4139 4139 page_t *pl = NULL;
4140 4140 ulong_t bin;
4141 4141 int mnode, page_mnode;
4142 4142 int szc;
4143 4143 spgcnt_t npgs, pg_cnt;
4144 4144 pfn_t pfnum;
4145 4145 int mtype;
4146 4146 int flags = 0;
4147 4147 lgrp_mnode_cookie_t lgrp_cookie;
4148 4148 lgrp_t *lgrp;
4149 4149
4150 4150 REPL_STAT_INCR(ngets);
4151 4151 like_pp = orig_like_pp;
4152 4152 ASSERT(PAGE_EXCL(like_pp));
4153 4153
4154 4154 szc = like_pp->p_szc;
4155 4155 npgs = page_get_pagecnt(szc);
4156 4156 /*
4157 4157 * Now we reset like_pp to the base page_t.
4158 4158 * That way, we won't walk past the end of this 'szc' page.
4159 4159 */
4160 4160 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4161 4161 like_pp = page_numtopp_nolock(pfnum);
4162 4162 ASSERT(like_pp->p_szc == szc);
4163 4163
4164 4164 if (PP_ISNORELOC(like_pp)) {
4165 4165 ASSERT(kcage_on);
4166 4166 REPL_STAT_INCR(ngets_noreloc);
4167 4167 flags = PGI_RELOCONLY;
4168 4168 } else if (pgrflags & PGR_NORELOC) {
4169 4169 ASSERT(kcage_on);
4170 4170 REPL_STAT_INCR(npgr_noreloc);
4171 4171 flags = PG_NORELOC;
4172 4172 }
4173 4173
4174 4174 /*
4175 4175 * Kernel pages must always be replaced with the same size
4176 4176 * pages, since we cannot properly handle demotion of kernel
4177 4177 * pages.
4178 4178 */
4179 4179 if (PP_ISKAS(like_pp))
4180 4180 pgrflags |= PGR_SAMESZC;
4181 4181
4182 - /* LINTED */
4183 - MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4182 + MTYPE_PGR_INIT(mtype, flags, like_pp, npgs);
4184 4183
4185 4184 while (npgs) {
4186 4185 pplist = NULL;
4187 4186 for (;;) {
4188 4187 pg_cnt = page_get_pagecnt(szc);
4189 4188 bin = PP_2_BIN(like_pp);
4190 4189 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4191 4190 ASSERT(pg_cnt <= npgs);
4192 4191
4193 4192 /*
4194 4193 * If an lgroup was specified, try to get the
4195 4194 * page from that lgroup.
4196 4195 * NOTE: Must be careful with code below because
4197 4196 * lgroup may disappear and reappear since there
4198 4197 * is no locking for lgroup here.
4199 4198 */
4200 4199 if (LGRP_EXISTS(lgrp_target)) {
4201 4200 /*
4202 4201 * Keep local variable for lgroup separate
4203 4202 * from lgroup argument since this code should
4204 4203 * only be exercised when lgroup argument
4205 4204 * exists....
4206 4205 */
4207 4206 lgrp = lgrp_target;
4208 4207
4209 4208 /* Try the lgroup's freelists first */
4210 4209 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4211 4210 LGRP_SRCH_LOCAL);
4212 4211 while ((pplist == NULL) &&
4213 4212 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4214 4213 != -1) {
4215 4214 pplist =
4216 4215 page_get_mnode_freelist(mnode, bin,
4217 4216 mtype, szc, flags);
4218 4217 }
4219 4218
4220 4219 /*
4221 4220				 * Now try its cachelists if this is a
4222 4221 * small page. Don't need to do it for
4223 4222 * larger ones since page_freelist_coalesce()
4224 4223 * already failed.
4225 4224 */
4226 4225 if (pplist != NULL || szc != 0)
4227 4226 break;
4228 4227
4229 4228				/* Now try its cachelists */
4230 4229 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4231 4230 LGRP_SRCH_LOCAL);
4232 4231
4233 4232 while ((pplist == NULL) &&
4234 4233 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4235 4234 != -1) {
4236 4235 pplist =
4237 4236 page_get_mnode_cachelist(bin, flags,
4238 4237 mnode, mtype);
4239 4238 }
4240 4239 if (pplist != NULL) {
4241 4240 page_hashout(pplist, NULL);
4242 4241 PP_SETAGED(pplist);
4243 4242 REPL_STAT_INCR(nhashout);
4244 4243 break;
4245 4244 }
4246 4245 /* Done looking in this lgroup. Bail out. */
4247 4246 break;
4248 4247 }
4249 4248
4250 4249 /*
4251 4250			 * No lgroup was specified (or the lgroup was removed by
4252 4251			 * DR), so just try to get the page as close to
4253 4252 * like_pp's mnode as possible.
4254 4253 * First try the local freelist...
4255 4254 */
4256 4255 mnode = PP_2_MEM_NODE(like_pp);
4257 4256 pplist = page_get_mnode_freelist(mnode, bin,
4258 4257 mtype, szc, flags);
4259 4258 if (pplist != NULL)
4260 4259 break;
4261 4260
4262 4261 REPL_STAT_INCR(nnofree);
4263 4262
4264 4263 /*
4265 4264 * ...then the local cachelist. Don't need to do it for
4266 4265			 * larger pages because page_freelist_coalesce() already
4267 4266 * failed there anyway.
4268 4267 */
4269 4268 if (szc == 0) {
4270 4269 pplist = page_get_mnode_cachelist(bin, flags,
4271 4270 mnode, mtype);
4272 4271 if (pplist != NULL) {
4273 4272 page_hashout(pplist, NULL);
4274 4273 PP_SETAGED(pplist);
4275 4274 REPL_STAT_INCR(nhashout);
4276 4275 break;
4277 4276 }
4278 4277 }
4279 4278
4280 4279 /* Now try remote freelists */
4281 4280 page_mnode = mnode;
4282 4281 lgrp =
4283 4282 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4284 4283 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4285 4284 LGRP_SRCH_HIER);
4286 4285 while (pplist == NULL &&
4287 4286 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4288 4287 != -1) {
4289 4288 /*
4290 4289 * Skip local mnode.
4291 4290 */
4292 4291 if ((mnode == page_mnode) ||
4293 4292 (mem_node_config[mnode].exists == 0))
4294 4293 continue;
4295 4294
4296 4295 pplist = page_get_mnode_freelist(mnode,
4297 4296 bin, mtype, szc, flags);
4298 4297 }
4299 4298
4300 4299 if (pplist != NULL)
4301 4300 break;
4302 4301
4303 4302
4304 4303 /* Now try remote cachelists */
4305 4304 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4306 4305 LGRP_SRCH_HIER);
4307 4306 while (pplist == NULL && szc == 0) {
4308 4307 mnode = lgrp_memnode_choose(&lgrp_cookie);
4309 4308 if (mnode == -1)
4310 4309 break;
4311 4310 /*
4312 4311 * Skip local mnode.
4313 4312 */
4314 4313 if ((mnode == page_mnode) ||
4315 4314 (mem_node_config[mnode].exists == 0))
4316 4315 continue;
4317 4316
4318 4317 pplist = page_get_mnode_cachelist(bin,
4319 4318 flags, mnode, mtype);
4320 4319
4321 4320 if (pplist != NULL) {
4322 4321 page_hashout(pplist, NULL);
4323 4322 PP_SETAGED(pplist);
4324 4323 REPL_STAT_INCR(nhashout);
4325 4324 break;
4326 4325 }
4327 4326 }
4328 4327
4329 4328 /*
4330 4329 * Break out of while loop under the following cases:
4331 4330 * - If we successfully got a page.
4332 4331 * - If pgrflags specified only returning a specific
4333 4332 * page size and we could not find that page size.
4334 4333 * - If we could not satisfy the request with PAGESIZE
4335 4334 * or larger pages.
4336 4335 */
4337 4336 if (pplist != NULL || szc == 0)
4338 4337 break;
4339 4338
4340 4339 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4341 4340 /* try to find contig page */
4342 4341
4343 4342 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4344 4343 LGRP_SRCH_HIER);
4345 4344
4346 4345 while ((pplist == NULL) &&
4347 4346 (mnode =
4348 4347 lgrp_memnode_choose(&lgrp_cookie))
4349 4348 != -1) {
4350 4349 pplist = page_get_contig_pages(
4351 4350 mnode, bin, mtype, szc,
4352 4351 flags | PGI_PGCPHIPRI);
4353 4352 }
4354 4353 break;
4355 4354 }
4356 4355
4357 4356 /*
4358 4357 * The correct thing to do here is try the next
4359 4358 * page size down using szc--. Due to a bug
4360 4359 * with the processing of HAT_RELOAD_SHARE
4361 4360 * where the sfmmu_ttecnt arrays of all
4362 4361 * hats sharing an ISM segment don't get updated,
4363 4362 * using intermediate size pages for relocation
4364 4363 * can lead to continuous page faults.
4365 4364 */
4366 4365 szc = 0;
4367 4366 }
4368 4367
4369 4368 if (pplist != NULL) {
4370 4369 DTRACE_PROBE4(page__get,
4371 4370 lgrp_t *, lgrp,
4372 4371 int, mnode,
4373 4372 ulong_t, bin,
4374 4373 uint_t, flags);
4375 4374
4376 4375 while (pplist != NULL && pg_cnt--) {
4377 4376 ASSERT(pplist != NULL);
4378 4377 pp = pplist;
4379 4378 page_sub(&pplist, pp);
4380 4379 PP_CLRFREE(pp);
4381 4380 PP_CLRAGED(pp);
4382 4381 page_list_concat(&pl, &pp);
4383 4382 npgs--;
4384 4383 like_pp = like_pp + 1;
4385 4384 REPL_STAT_INCR(nnext_pp);
4386 4385 }
4387 4386 ASSERT(pg_cnt == 0);
4388 4387 } else {
4389 4388 break;
4390 4389 }
4391 4390 }
4392 4391
4393 4392 if (npgs) {
4394 4393 /*
4395 4394 * We were unable to allocate the necessary number
4396 4395 * of pages.
4397 4396 * We need to free up any pl.
4398 4397 */
4399 4398 REPL_STAT_INCR(nnopage);
4400 4399 page_free_replacement_page(pl);
4401 4400 return (NULL);
4402 4401 } else {
4403 4402 return (pl);
4404 4403 }
4405 4404 }
4406 4405
4407 4406 /*
4408 4407 * demote a free large page to its constituent pages
4409 4408 */
4410 4409 void
4411 4410 page_demote_free_pages(page_t *pp)
4412 4411 {
4413 4412
4414 4413 int mnode;
4415 4414
4416 4415 ASSERT(pp != NULL);
4417 4416 ASSERT(PAGE_LOCKED(pp));
4418 4417 ASSERT(PP_ISFREE(pp));
4419 4418 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4420 4419
4421 4420 mnode = PP_2_MEM_NODE(pp);
4422 4421 page_freelist_lock(mnode);
4423 4422 if (pp->p_szc != 0) {
4424 4423 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4425 4424 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4426 4425 }
4427 4426 page_freelist_unlock(mnode);
4428 4427 ASSERT(pp->p_szc == 0);
4429 4428 }
4430 4429
4431 4430 /*
4432 4431 * Factor in colorequiv to check additional 'equivalent' bins.
4433 4432 * colorequiv may be set in /etc/system
4434 4433 */
4435 4434 void
4436 4435 page_set_colorequiv_arr(void)
4437 4436 {
4438 4437 if (colorequiv > 1) {
4439 4438 int i;
4440 4439 uint_t sv_a = lowbit(colorequiv) - 1;
4441 4440
4442 4441 if (sv_a > 15)
4443 4442 sv_a = 15;
4444 4443
4445 4444 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4446 4445 uint_t colors;
4447 4446 uint_t a = sv_a;
4448 4447
4449 4448 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4450 4449 continue;
4451 4450 }
4452 4451 while ((colors >> a) == 0)
4453 4452 a--;
4454 4453 if ((a << 4) > colorequivszc[i]) {
4455 4454 colorequivszc[i] = (a << 4);
4456 4455 }
4457 4456 }
4458 4457 }
4459 4458 }
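/*
 * Illustrative example (hypothetical values, a sketch of the loop above):
 * with colorequiv = 4 set in /etc/system and a page size having 32 hardware
 * colors, sv_a = lowbit(4) - 1 = 2, (32 >> 2) != 0 so a stays 2, and
 * colorequivszc[i] becomes 0x20, i.e. the two high-order color bits are
 * ignored and groups of four colors are treated as equivalent.
 */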