11528 Makefile.noget can get gone
11529 Use -Wno-maybe-uninitialized
--- old/usr/src/uts/i86pc/vm/vm_machdep.c
+++ new/usr/src/uts/i86pc/vm/vm_machdep.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24 /*
25 25 * Copyright (c) 2010, Intel Corporation.
26 26 * All rights reserved.
27 27 * Copyright 2019, Joyent, Inc.
28 28 */
29 29
30 30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 31 /* All Rights Reserved */
32 32
33 33 /*
34 34 * Portions of this source code were derived from Berkeley 4.3 BSD
35 35 * under license from the Regents of the University of California.
36 36 */
37 37
38 38 /*
39 39 * UNIX machine dependent virtual memory support.
40 40 */
41 41
42 42 #include <sys/types.h>
43 43 #include <sys/param.h>
44 44 #include <sys/systm.h>
45 45 #include <sys/user.h>
46 46 #include <sys/proc.h>
47 47 #include <sys/kmem.h>
48 48 #include <sys/vmem.h>
49 49 #include <sys/buf.h>
50 50 #include <sys/cpuvar.h>
51 51 #include <sys/lgrp.h>
52 52 #include <sys/disp.h>
53 53 #include <sys/vm.h>
54 54 #include <sys/mman.h>
55 55 #include <sys/vnode.h>
56 56 #include <sys/cred.h>
57 57 #include <sys/exec.h>
58 58 #include <sys/exechdr.h>
59 59 #include <sys/debug.h>
60 60 #include <sys/vmsystm.h>
61 61 #include <sys/swap.h>
62 62 #include <sys/dumphdr.h>
63 63 #include <sys/random.h>
64 64
65 65 #include <vm/hat.h>
66 66 #include <vm/as.h>
67 67 #include <vm/seg.h>
68 68 #include <vm/seg_kp.h>
69 69 #include <vm/seg_vn.h>
70 70 #include <vm/page.h>
71 71 #include <vm/seg_kmem.h>
72 72 #include <vm/seg_kpm.h>
73 73 #include <vm/vm_dep.h>
74 74
75 75 #include <sys/cpu.h>
76 76 #include <sys/vm_machparam.h>
77 77 #include <sys/memlist.h>
78 78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
79 79 #include <vm/hat_i86.h>
80 80 #include <sys/x86_archext.h>
81 81 #include <sys/elf_386.h>
82 82 #include <sys/cmn_err.h>
83 83 #include <sys/archsystm.h>
84 84 #include <sys/machsystm.h>
85 85 #include <sys/secflags.h>
86 86
87 87 #include <sys/vtrace.h>
88 88 #include <sys/ddidmareq.h>
89 89 #include <sys/promif.h>
90 90 #include <sys/memnode.h>
91 91 #include <sys/stack.h>
92 92 #include <util/qsort.h>
93 93 #include <sys/taskq.h>
94 94
95 95 #ifdef __xpv
96 96
97 97 #include <sys/hypervisor.h>
98 98 #include <sys/xen_mmu.h>
99 99 #include <sys/balloon_impl.h>
100 100
101 101 /*
102 102  * domain 0 pages usable for DMA are pre-allocated and kept in
103 103 * distinct lists, ordered by increasing mfn.
104 104 */
105 105 static kmutex_t io_pool_lock;
106 106 static kmutex_t contig_list_lock;
107 107 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */
108 108 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */
109 109 static long io_pool_cnt;
110 110 static long io_pool_cnt_max = 0;
111 111 #define DEFAULT_IO_POOL_MIN 128
112 112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
113 113 static long io_pool_cnt_lowater = 0;
114 114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
115 115 static long io_pool_shrinks; /* how many times did we really shrink */
116 116 static long io_pool_grows; /* how many times did we grow */
117 117 static mfn_t start_mfn = 1;
118 118 static caddr_t io_pool_kva;		/* used to alloc pages when needed */
119 119
120 120 static int create_contig_pfnlist(uint_t);
121 121
122 122 /*
123 123 * percentage of phys mem to hold in the i/o pool
124 124 */
125 125 #define DEFAULT_IO_POOL_PCT 2
126 126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
127 127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
128 128 int ioalloc_dbg = 0;
129 129
130 130 #endif /* __xpv */
131 131
132 132 uint_t vac_colors = 1;
133 133
134 134 int largepagesupport = 0;
135 135 extern uint_t page_create_new;
136 136 extern uint_t page_create_exists;
137 137 extern uint_t page_create_putbacks;
138 138 /*
139 139 * Allow users to disable the kernel's use of SSE.
140 140 */
141 141 extern int use_sse_pagecopy, use_sse_pagezero;
142 142
143 143 /*
144 144 * combined memory ranges from mnode and memranges[] to manage single
145 145 * mnode/mtype dimension in the page lists.
146 146 */
147 147 typedef struct {
148 148 pfn_t mnr_pfnlo;
149 149 pfn_t mnr_pfnhi;
150 150 int mnr_mnode;
151 151 int mnr_memrange; /* index into memranges[] */
152 152 int mnr_next; /* next lower PA mnoderange */
153 153 int mnr_exists;
154 154 /* maintain page list stats */
155 155 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */
156 156 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
157 157 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */
158 158 #ifdef DEBUG
159 159 struct mnr_mts { /* mnode/mtype szc stats */
160 160 pgcnt_t mnr_mts_pgcnt;
161 161 int mnr_mts_colors;
162 162 pgcnt_t *mnr_mtsc_pgcnt;
163 163 } *mnr_mts;
164 164 #endif
165 165 } mnoderange_t;
166 166
167 167 #define MEMRANGEHI(mtype) \
168 168 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
169 169 #define MEMRANGELO(mtype) (memranges[mtype])
170 170
171 171 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
172 172
173 173 /*
174 174  * As the PC architecture evolved, memory was clumped into several
175 175 * ranges for various historical I/O devices to do DMA.
176 176 * < 16Meg - ISA bus
177 177 * < 2Gig - ???
178 178 * < 4Gig - PCI bus or drivers that don't understand PAE mode
179 179 *
180 180 * These are listed in reverse order, so that we can skip over unused
181 181 * ranges on machines with small memories.
182 182 *
183 183 * For now under the Hypervisor, we'll only ever have one memrange.
184 184 */
185 185 #define PFN_4GIG 0x100000
186 186 #define PFN_16MEG 0x1000
187 187 /* Indices into the memory range (arch_memranges) array. */
188 188 #define MRI_4G 0
189 189 #define MRI_2G 1
190 190 #define MRI_16M 2
191 191 #define MRI_0 3
192 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
193 193 PFN_4GIG, /* pfn range for 4G and above */
194 194 0x80000, /* pfn range for 2G-4G */
195 195 PFN_16MEG, /* pfn range for 16M-2G */
196 196 0x00000, /* pfn range for 0-16M */
197 197 };
198 198 pfn_t *memranges = &arch_memranges[0];
199 199 int nranges = NUM_MEM_RANGES;
200 200
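/*
 * A worked reading of the macros above (machine size hypothetical): with
 * the default arch_memranges and a machine whose highest pfn (physmax)
 * is 0x1fffff (8 GB), the mtypes cover:
 *
 *	MRI_0   (3): [MEMRANGELO(3), MEMRANGEHI(3)] = [0x0,      0xfff   ] 0-16M
 *	MRI_16M (2): [MEMRANGELO(2), MEMRANGEHI(2)] = [0x1000,   0x7ffff ] 16M-2G
 *	MRI_2G  (1): [MEMRANGELO(1), MEMRANGEHI(1)] = [0x80000,  0xfffff ] 2G-4G
 *	MRI_4G  (0): [MEMRANGELO(0), MEMRANGEHI(0)] = [0x100000, 0x1fffff] 4G-8G
 */
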
201 201 /*
202 202 * This combines mem_node_config and memranges into one data
203 203 * structure to be used for page list management.
204 204 */
205 205 static mnoderange_t *mnoderanges;
206 206 static int mnoderangecnt;
207 207 static int mtype4g;
208 208 static int mtype16m;
209 209 static int mtypetop;
210 210
211 211 /*
212 212 * 4g memory management variables for systems with more than 4g of memory:
213 213 *
214 214 * physical memory below 4g is required for 32bit dma devices and, currently,
215 215 * for kmem memory. On systems with more than 4g of memory, the pool of memory
216 216 * below 4g can be depleted without any paging activity given that there is
217 217 * likely to be sufficient memory above 4g.
218 218 *
219 219 * physmax4g is set true if the largest pfn is over 4g. The rest of the
220 220 * 4g memory management code is enabled only when physmax4g is true.
221 221 *
222 222 * maxmem4g is the count of the maximum number of pages on the page lists
223 223  * with physical addresses below 4g. It can be a lot less than 4g given that
224 224 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
225 225 * agp aperture etc.
226 226 *
227 227 * freemem4g maintains the count of the number of available pages on the
228 228 * page lists with physical addresses below 4g.
229 229 *
230 230 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
231 231  * 6.25% (desfree4gshift = 4) of maxmem4g.
232 232 *
233 233 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
234 234 * and the amount of physical memory above 4g is greater than freemem4g.
235 235 * In this case, page_get_* routines will restrict below 4g allocations
236 236 * for requests that don't specifically require it.
237 237 */
238 238
239 239 #define DESFREE4G (maxmem4g >> desfree4gshift)
240 240
241 241 #define RESTRICT4G_ALLOC \
242 242 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
243 243
244 244 static pgcnt_t maxmem4g;
245 245 static pgcnt_t freemem4g;
246 246 static int physmax4g;
247 247 static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */
248 248
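/*
 * To put numbers on DESFREE4G (purely illustrative): if maxmem4g is
 * 0xc0000 pages (3 GB usable below 4g, with 4K pages), then with
 * desfree4gshift = 4, DESFREE4G = maxmem4g >> 4 = 0xc000 pages (192 MB),
 * i.e. 1/16th of maxmem4g. RESTRICT4G_ALLOC then fires once freemem4g
 * drops below that and less than half of all free memory sits below 4g.
 */
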
249 249 /*
250 250 * 16m memory management:
251 251 *
252 252 * reserve some amount of physical memory below 16m for legacy devices.
253 253 *
254 254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
255 255 * 16m or if the 16m pool drops below DESFREE16M.
256 256 *
257 257 * In this case, general page allocations via page_get_{free,cache}list
258 258 * routines will be restricted from allocating from the 16m pool. Allocations
259 259 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
260 260 * are not restricted.
261 261 */
262 262
263 263 #define FREEMEM16M MTYPE_FREEMEM(mtype16m)
264 264 #define DESFREE16M desfree16m
265 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
266 266 (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
267 267 ((freemem >= (FREEMEM16M)) || \
268 268 (FREEMEM16M < (DESFREE16M + pgcnt))))
269 269
270 270 static pgcnt_t desfree16m = 0x380;
271 271
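/*
 * For scale (illustrative): desfree16m = 0x380 is 896 pages, or 3.5 MB
 * with 4K pages -- a bit under a quarter of the 16 MB reachable by
 * 24 bit ISA-style DMA.
 */
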
272 272 /*
273 273 * This can be patched via /etc/system to allow old non-PAE aware device
274 274 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
275 275 */
276 276 int restricted_kmemalloc = 0;
277 277
278 278 #ifdef VM_STATS
279 279 struct {
280 280 ulong_t pga_alloc;
281 281 ulong_t pga_notfullrange;
282 282 ulong_t pga_nulldmaattr;
283 283 ulong_t pga_allocok;
284 284 ulong_t pga_allocfailed;
285 285 ulong_t pgma_alloc;
286 286 ulong_t pgma_allocok;
287 287 ulong_t pgma_allocfailed;
288 288 ulong_t pgma_allocempty;
289 289 } pga_vmstats;
290 290 #endif
291 291
292 292 uint_t mmu_page_sizes;
293 293
294 294 /* How many page sizes the users can see */
295 295 uint_t mmu_exported_page_sizes;
296 296
297 297 /* page sizes that legacy applications can see */
298 298 uint_t mmu_legacy_page_sizes;
299 299
300 300 /*
301 301 * Number of pages in 1 GB. Don't enable automatic large pages if we have
302 302 * fewer than this many pages.
303 303 */
304 304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
305 305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
306 306
307 307 /*
308 308 * Maximum and default segment size tunables for user private
309 309 * and shared anon memory, and user text and initialized data.
310 310 * These can be patched via /etc/system to allow large pages
311 311 * to be used for mapping application private and shared anon memory.
312 312 */
313 313 size_t mcntl0_lpsize = MMU_PAGESIZE;
314 314 size_t max_uheap_lpsize = MMU_PAGESIZE;
315 315 size_t default_uheap_lpsize = MMU_PAGESIZE;
316 316 size_t max_ustack_lpsize = MMU_PAGESIZE;
317 317 size_t default_ustack_lpsize = MMU_PAGESIZE;
318 318 size_t max_privmap_lpsize = MMU_PAGESIZE;
319 319 size_t max_uidata_lpsize = MMU_PAGESIZE;
320 320 size_t max_utext_lpsize = MMU_PAGESIZE;
321 321 size_t max_shm_lpsize = MMU_PAGESIZE;
322 322
323 323
324 324 /*
325 325 * initialized by page_coloring_init().
326 326 */
327 327 uint_t page_colors;
328 328 uint_t page_colors_mask;
329 329 uint_t page_coloring_shift;
330 330 int cpu_page_colors;
331 331 static uint_t l2_colors;
332 332
333 333 /*
334 334 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
335 335 * and page_colors are calculated from the l2 cache n-way set size. Within a
336 336 * mnode range, the page freelist and cachelist are hashed into bins based on
337 337 * color. This makes it easier to search for a page within a specific memory
338 338 * range.
339 339 */
340 340 #define PAGE_COLORS_MIN 16
341 341
342 342 page_t ****page_freelists;
343 343 page_t ***page_cachelists;
344 344
345 345
346 346 /*
347 347 * Used by page layer to know about page sizes
348 348 */
349 349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
350 350
351 351 kmutex_t *fpc_mutex[NPC_MUTEX];
352 352 kmutex_t *cpc_mutex[NPC_MUTEX];
353 353
354 354 /* Lock to protect mnoderanges array for memory DR operations. */
355 355 static kmutex_t mnoderange_lock;
356 356
357 357 /*
358 358 * Only let one thread at a time try to coalesce large pages, to
359 359 * prevent them from working against each other.
360 360 */
361 361 static kmutex_t contig_lock;
362 362 #define CONTIG_LOCK() mutex_enter(&contig_lock);
363 363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
364 364
365 365 #define PFN_16M (mmu_btop((uint64_t)0x1000000))
366 366
367 367 caddr_t
368 368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
369 369 {
370 370 caddr_t addr;
371 371 caddr_t addr1;
372 372 page_t *pp;
373 373
374 374 addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
375 375
376 376 for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
377 377 pp = page_numtopp_nolock(pf);
378 378 if (pp == NULL) {
379 379 hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
380 380 prot | HAT_NOSYNC, HAT_LOAD_LOCK);
381 381 } else {
382 382 hat_memload(kas.a_hat, addr, pp,
383 383 prot | HAT_NOSYNC, HAT_LOAD_LOCK);
384 384 }
385 385 }
386 386
387 387 return (addr1);
388 388 }
389 389
390 390 /*
391 391 * This routine is like page_numtopp, but accepts only free pages, which
392 392 * it allocates (unfrees) and returns with the exclusive lock held.
393 393 * It is used by machdep.c/dma_init() to find contiguous free pages.
394 394 */
395 395 page_t *
396 396 page_numtopp_alloc(pfn_t pfnum)
397 397 {
398 398 page_t *pp;
399 399
400 400 retry:
401 401 pp = page_numtopp_nolock(pfnum);
402 402 if (pp == NULL) {
403 403 return (NULL);
404 404 }
405 405
406 406 if (!page_trylock(pp, SE_EXCL)) {
407 407 return (NULL);
408 408 }
409 409
410 410 if (page_pptonum(pp) != pfnum) {
411 411 page_unlock(pp);
412 412 goto retry;
413 413 }
414 414
415 415 if (!PP_ISFREE(pp)) {
416 416 page_unlock(pp);
417 417 return (NULL);
418 418 }
419 419 if (pp->p_szc) {
420 420 page_demote_free_pages(pp);
421 421 page_unlock(pp);
422 422 goto retry;
423 423 }
424 424
425 425 /* If associated with a vnode, destroy mappings */
426 426
427 427 if (pp->p_vnode) {
428 428
429 429 page_destroy_free(pp);
430 430
431 431 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
432 432 return (NULL);
433 433 }
434 434
435 435 if (page_pptonum(pp) != pfnum) {
436 436 page_unlock(pp);
437 437 goto retry;
438 438 }
439 439 }
440 440
441 441 if (!PP_ISFREE(pp)) {
442 442 page_unlock(pp);
443 443 return (NULL);
444 444 }
445 445
446 446 if (!page_reclaim(pp, (kmutex_t *)NULL))
447 447 return (NULL);
448 448
449 449 return (pp);
450 450 }
451 451
452 452 /*
453 453 * Return the optimum page size for a given mapping
454 454 */
455 455 /*ARGSUSED*/
456 456 size_t
457 457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
458 458 {
459 459 level_t l = 0;
460 460 size_t pgsz = MMU_PAGESIZE;
461 461 size_t max_lpsize;
462 462 uint_t mszc;
463 463
464 464 ASSERT(maptype != MAPPGSZ_VA);
465 465
466 466 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
467 467 return (MMU_PAGESIZE);
468 468 }
469 469
470 470 switch (maptype) {
471 471 case MAPPGSZ_HEAP:
472 472 case MAPPGSZ_STK:
473 473 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
474 474 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
475 475 if (max_lpsize == MMU_PAGESIZE) {
476 476 return (MMU_PAGESIZE);
477 477 }
478 478 if (len == 0) {
479 479 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
480 480 p->p_brksize - p->p_bssbase : p->p_stksize;
481 481 }
482 482 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
483 483 default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
484 484
485 485 /*
486 486 		 * use the page size that best fits len
487 487 */
488 488 for (l = mmu.umax_page_level; l > 0; --l) {
489 489 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
490 490 continue;
491 491 } else {
492 492 pgsz = LEVEL_SIZE(l);
493 493 }
494 494 break;
495 495 }
496 496
497 497 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
498 498 p->p_stkpageszc);
499 499 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
500 500 pgsz = hw_page_array[mszc].hp_size;
501 501 }
502 502 return (pgsz);
503 503
504 504 case MAPPGSZ_ISM:
505 505 for (l = mmu.umax_page_level; l > 0; --l) {
506 506 if (len >= LEVEL_SIZE(l))
507 507 return (LEVEL_SIZE(l));
508 508 }
509 509 return (LEVEL_SIZE(0));
510 510 }
511 511 return (pgsz);
512 512 }
513 513
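/*
 * A concrete pass through the "best fit" loop above (values assumed, not
 * taken from a real configuration): on amd64 with LEVEL_SIZE(1) being the
 * 2M large page, a MAPPGSZ_HEAP request with len = 5 MB and
 * max_uheap_lpsize = 2M skips every level whose size exceeds 2M or len,
 * and settles on pgsz = 2M. A len smaller than 2M leaves pgsz at the base
 * MMU_PAGESIZE, subject to the p_brkpageszc/p_stkpageszc fallback that
 * follows the loop.
 */
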
514 514 static uint_t
515 515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
516 516 size_t min_physmem)
517 517 {
518 518 caddr_t eaddr = addr + size;
519 519 uint_t szcvec = 0;
520 520 caddr_t raddr;
521 521 caddr_t readdr;
522 522 size_t pgsz;
523 523 int i;
524 524
525 525 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
526 526 return (0);
527 527 }
528 528
529 529 for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
530 530 pgsz = page_get_pagesize(i);
531 531 if (pgsz > max_lpsize) {
532 532 continue;
533 533 }
534 534 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
535 535 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
536 536 if (raddr < addr || raddr >= readdr) {
537 537 continue;
538 538 }
539 539 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
540 540 continue;
541 541 }
542 542 /*
543 543 * Set szcvec to the remaining page sizes.
544 544 */
545 545 szcvec = ((1 << (i + 1)) - 1) & ~1;
546 546 break;
547 547 }
548 548 return (szcvec);
549 549 }
550 550
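/*
 * The vector built above sets one bit per usable size code. If the
 * largest qualifying index in map_szcvec()'s loop is i = 2, then
 * szcvec = ((1 << 3) - 1) & ~1 = 0x6: size codes 1 and 2 are offered,
 * and bit 0 (the base page size) is always masked off.
 */
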
551 551 /*
552 552 * Return a bit vector of large page size codes that
553 553 * can be used to map [addr, addr + len) region.
554 554 */
555 555 /*ARGSUSED*/
556 556 uint_t
557 557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
558 558 int memcntl)
559 559 {
560 560 size_t max_lpsize = mcntl0_lpsize;
561 561
562 562 if (mmu.max_page_level == 0)
563 563 return (0);
564 564
565 565 if (flags & MAP_TEXT) {
566 566 if (!memcntl)
567 567 max_lpsize = max_utext_lpsize;
568 568 return (map_szcvec(addr, size, off, max_lpsize,
569 569 shm_lpg_min_physmem));
570 570
571 571 } else if (flags & MAP_INITDATA) {
572 572 if (!memcntl)
573 573 max_lpsize = max_uidata_lpsize;
574 574 return (map_szcvec(addr, size, off, max_lpsize,
575 575 privm_lpg_min_physmem));
576 576
577 577 } else if (type == MAPPGSZC_SHM) {
578 578 if (!memcntl)
579 579 max_lpsize = max_shm_lpsize;
580 580 return (map_szcvec(addr, size, off, max_lpsize,
581 581 shm_lpg_min_physmem));
582 582
583 583 } else if (type == MAPPGSZC_HEAP) {
584 584 if (!memcntl)
585 585 max_lpsize = max_uheap_lpsize;
586 586 return (map_szcvec(addr, size, off, max_lpsize,
587 587 privm_lpg_min_physmem));
588 588
589 589 } else if (type == MAPPGSZC_STACK) {
590 590 if (!memcntl)
591 591 max_lpsize = max_ustack_lpsize;
592 592 return (map_szcvec(addr, size, off, max_lpsize,
593 593 privm_lpg_min_physmem));
594 594
595 595 } else {
596 596 if (!memcntl)
597 597 max_lpsize = max_privmap_lpsize;
598 598 return (map_szcvec(addr, size, off, max_lpsize,
599 599 privm_lpg_min_physmem));
600 600 }
601 601 }
602 602
603 603 /*
604 604 * Handle a pagefault.
605 605 */
606 606 faultcode_t
607 607 pagefault(
608 608 caddr_t addr,
609 609 enum fault_type type,
610 610 enum seg_rw rw,
611 611 int iskernel)
612 612 {
613 613 struct as *as;
614 614 struct hat *hat;
615 615 struct proc *p;
616 616 kthread_t *t;
617 617 faultcode_t res;
618 618 caddr_t base;
619 619 size_t len;
620 620 int err;
621 621 int mapped_red;
622 622 uintptr_t ea;
623 623
624 624 ASSERT_STACK_ALIGNED();
625 625
626 626 if (INVALID_VADDR(addr))
627 627 return (FC_NOMAP);
628 628
629 629 mapped_red = segkp_map_red();
630 630
631 631 if (iskernel) {
632 632 as = &kas;
633 633 hat = as->a_hat;
634 634 } else {
635 635 t = curthread;
636 636 p = ttoproc(t);
637 637 as = p->p_as;
638 638 hat = as->a_hat;
639 639 }
640 640
641 641 /*
642 642 * Dispatch pagefault.
643 643 */
644 644 res = as_fault(hat, as, addr, 1, type, rw);
645 645
646 646 /*
647 647 * If this isn't a potential unmapped hole in the user's
648 648 * UNIX data or stack segments, just return status info.
649 649 */
650 650 if (res != FC_NOMAP || iskernel)
651 651 goto out;
652 652
653 653 /*
654 654 	 * Check to see if we happened to fault on a currently unmapped
655 655 * part of the UNIX data or stack segments. If so, create a zfod
656 656 * mapping there and then try calling the fault routine again.
657 657 */
658 658 base = p->p_brkbase;
659 659 len = p->p_brksize;
660 660
661 661 if (addr < base || addr >= base + len) { /* data seg? */
662 662 base = (caddr_t)p->p_usrstack - p->p_stksize;
663 663 len = p->p_stksize;
664 664 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
665 665 /* not in either UNIX data or stack segments */
666 666 res = FC_NOMAP;
667 667 goto out;
668 668 }
669 669 }
670 670
671 671 /*
672 672 	 * The rest of this function implements 3.X/4.X/5.X compatibility.
673 673 	 * This code is probably not needed anymore.
674 674 */
675 675 if (p->p_model == DATAMODEL_ILP32) {
676 676
677 677 /* expand the gap to the page boundaries on each side */
678 678 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
679 679 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
680 680 len = ea - (uintptr_t)base;
681 681
682 682 as_rangelock(as);
683 683 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
684 684 0) {
685 685 err = as_map(as, base, len, segvn_create, zfod_argsp);
686 686 as_rangeunlock(as);
687 687 if (err) {
688 688 res = FC_MAKE_ERR(err);
689 689 goto out;
690 690 }
691 691 } else {
692 692 /*
693 693 * This page is already mapped by another thread after
694 694 * we returned from as_fault() above. We just fall
695 695 			 * through to as_fault() below.
696 696 */
697 697 as_rangeunlock(as);
698 698 }
699 699
700 700 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
701 701 }
702 702
703 703 out:
704 704 if (mapped_red)
705 705 segkp_unmap_red();
706 706
707 707 return (res);
708 708 }
709 709
710 710 void
711 711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
712 712 {
713 713 struct proc *p = curproc;
714 714 caddr_t userlimit = (flags & _MAP_LOW32) ?
715 715 (caddr_t)_userlimit32 : p->p_as->a_userlimit;
716 716
717 717 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
718 718 }
719 719
720 720 /*ARGSUSED*/
721 721 int
722 722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
723 723 {
724 724 return (0);
725 725 }
726 726
727 727 /*
728 728 * The maximum amount a randomized mapping will be slewed. We should perhaps
729 729 * arrange things so these tunables can be separate for mmap, mmapobj, and
730 730 * ld.so
731 731 */
732 732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
733 733
734 734 /*
735 735 * map_addr_proc() is the routine called when the system is to
736 736 * choose an address for the user. We will pick an address
737 737 * range which is the highest available below userlimit.
738 738 *
739 739 * Every mapping will have a redzone of a single page on either side of
740 740 * the request. This is done to leave one page unmapped between segments.
741 741 * This is not required, but it's useful for the user because if their
742 742 * program strays across a segment boundary, it will catch a fault
743 743 * immediately making debugging a little easier. Currently the redzone
744 744 * is mandatory.
745 745 *
746 746 * addrp is a value/result parameter.
747 747 * On input it is a hint from the user to be used in a completely
748 748 * machine dependent fashion. We decide to completely ignore this hint.
749 749 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
750 750 * must be some "power of two" multiple of pagesize.
751 751 *
752 752 * On output it is NULL if no address can be found in the current
753 753  * process's address space or else an address that is currently
754 754 * not mapped for len bytes with a page of red zone on either side.
755 755 *
756 756  * vacalign is not needed on x86 (it's for virtually addressed caches)
757 757 */
758 758 /*ARGSUSED*/
759 759 void
760 760 map_addr_proc(
761 761 caddr_t *addrp,
762 762 size_t len,
763 763 offset_t off,
764 764 int vacalign,
765 765 caddr_t userlimit,
766 766 struct proc *p,
767 767 uint_t flags)
768 768 {
769 769 struct as *as = p->p_as;
770 770 caddr_t addr;
771 771 caddr_t base;
772 772 size_t slen;
773 773 size_t align_amount;
774 774
775 775 ASSERT32(userlimit == as->a_userlimit);
776 776
777 777 base = p->p_brkbase;
778 778 #if defined(__amd64)
779 779 if (p->p_model == DATAMODEL_NATIVE) {
780 780 if (userlimit < as->a_userlimit) {
781 781 /*
782 782 * This happens when a program wants to map
783 783 * something in a range that's accessible to a
784 784 * program in a smaller address space. For example,
785 785 * a 64-bit program calling mmap32(2) to guarantee
786 786 * that the returned address is below 4Gbytes.
787 787 */
788 788 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
789 789
790 790 if (userlimit > base)
791 791 slen = userlimit - base;
792 792 else {
793 793 *addrp = NULL;
794 794 return;
795 795 }
796 796 } else {
797 797 /*
798 798 * With the stack positioned at a higher address than
799 799 * the heap for 64-bit processes, it is necessary to be
800 800 * mindful of its location and potential size.
801 801 *
802 802 * Unallocated space above the top of the stack (that
803 803 * is, at a lower address) but still within the bounds
804 804 * of the stack limit should be considered unavailable.
805 805 *
806 806 * As the 64-bit stack guard is mapped in immediately
807 807 * adjacent to the stack limit boundary, this prevents
808 808 * new mappings from having accidentally dangerous
809 809 * proximity to the stack.
810 810 */
811 811 slen = p->p_usrstack - base -
812 812 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
813 813 }
814 814 } else
815 815 #endif /* defined(__amd64) */
816 816 slen = userlimit - base;
817 817
818 818 /* Make len be a multiple of PAGESIZE */
819 819 len = (len + PAGEOFFSET) & PAGEMASK;
820 820
821 821 /*
822 822 * figure out what the alignment should be
823 823 *
824 824 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
825 825 */
826 826 if (len <= ELF_386_MAXPGSZ) {
827 827 /*
828 828 * Align virtual addresses to ensure that ELF shared libraries
829 829 * are mapped with the appropriate alignment constraints by
830 830 * the run-time linker.
831 831 */
832 832 align_amount = ELF_386_MAXPGSZ;
833 833 } else {
834 834 /*
835 835 * For 32-bit processes, only those which have specified
836 836 * MAP_ALIGN and an addr will be aligned on a larger page size.
837 837 * Not doing so can potentially waste up to 1G of process
838 838 * address space.
839 839 */
840 840 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
841 841 mmu.umax_page_level;
842 842
843 843 while (lvl && len < LEVEL_SIZE(lvl))
844 844 --lvl;
845 845
846 846 align_amount = LEVEL_SIZE(lvl);
847 847 }
848 848 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
849 849 align_amount = (uintptr_t)*addrp;
850 850
851 851 ASSERT(ISP2(align_amount));
852 852 ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
853 853
854 854 off = off & (align_amount - 1);
855 855
856 856 /*
857 857 * Look for a large enough hole starting below userlimit.
858 858 * After finding it, use the upper part.
859 859 */
860 860 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
861 861 PAGESIZE, off) == 0) {
862 862 caddr_t as_addr;
863 863
864 864 /*
865 865 * addr is the highest possible address to use since we have
866 866 * a PAGESIZE redzone at the beginning and end.
867 867 */
868 868 addr = base + slen - (PAGESIZE + len);
869 869 as_addr = addr;
870 870 /*
871 871 * Round address DOWN to the alignment amount and
872 872 * add the offset in.
873 873 * If addr is greater than as_addr, len would not be large
874 874 * enough to include the redzone, so we must adjust down
875 875 * by the alignment amount.
876 876 */
877 877 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
878 878 addr += (uintptr_t)off;
879 879 if (addr > as_addr) {
880 880 addr -= align_amount;
881 881 }
882 882
883 883 /*
884 884 * If randomization is requested, slew the allocation
885 885 * backwards, within the same gap, by a random amount.
886 886 */
887 887 if (flags & _MAP_RANDOMIZE) {
888 888 uint32_t slew;
889 889
890 890 (void) random_get_pseudo_bytes((uint8_t *)&slew,
891 891 sizeof (slew));
892 892
893 893 slew = slew % MIN(aslr_max_map_skew, (addr - base));
894 894 addr -= P2ALIGN(slew, align_amount);
895 895 }
896 896
897 897 ASSERT(addr > base);
898 898 ASSERT(addr + len < base + slen);
899 899 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
900 900 ((uintptr_t)(off)));
901 901 *addrp = addr;
902 902 } else {
903 903 *addrp = NULL; /* no more virtual space */
904 904 }
905 905 }
906 906
907 907 int valid_va_range_aligned_wraparound;
908 908
909 909 /*
910 910 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
911 911 * addresses at least "minlen" long, where the base of the range is at "off"
912 912 * phase from an "align" boundary and there is space for a "redzone"-sized
913 913 * redzone on either side of the range. On success, 1 is returned and *basep
914 914 * and *lenp are adjusted to describe the acceptable range (including
915 915 * the redzone). On failure, 0 is returned.
916 916 */
917 917 /*ARGSUSED3*/
918 918 int
919 919 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
920 920 size_t align, size_t redzone, size_t off)
921 921 {
922 922 uintptr_t hi, lo;
923 923 size_t tot_len;
924 924
925 925 ASSERT(align == 0 ? off == 0 : off < align);
926 926 ASSERT(ISP2(align));
927 927 ASSERT(align == 0 || align >= PAGESIZE);
928 928
929 929 lo = (uintptr_t)*basep;
930 930 hi = lo + *lenp;
931 931 tot_len = minlen + 2 * redzone; /* need at least this much space */
932 932
933 933 /*
934 934 * If hi rolled over the top, try cutting back.
935 935 */
936 936 if (hi < lo) {
937 937 *lenp = 0UL - lo - 1UL;
938 938 /* See if this really happens. If so, then we figure out why */
939 939 valid_va_range_aligned_wraparound++;
940 940 hi = lo + *lenp;
941 941 }
942 942 if (*lenp < tot_len) {
943 943 return (0);
944 944 }
945 945
946 946 #if defined(__amd64)
947 947 /*
948 948 * Deal with a possible hole in the address range between
949 949 * hole_start and hole_end that should never be mapped.
950 950 */
951 951 if (lo < hole_start) {
952 952 if (hi > hole_start) {
953 953 if (hi < hole_end) {
954 954 hi = hole_start;
955 955 } else {
956 956 /* lo < hole_start && hi >= hole_end */
957 957 if (dir == AH_LO) {
958 958 /*
959 959 * prefer lowest range
960 960 */
961 961 if (hole_start - lo >= tot_len)
962 962 hi = hole_start;
963 963 else if (hi - hole_end >= tot_len)
964 964 lo = hole_end;
965 965 else
966 966 return (0);
967 967 } else {
968 968 /*
969 969 * prefer highest range
970 970 */
971 971 if (hi - hole_end >= tot_len)
972 972 lo = hole_end;
973 973 else if (hole_start - lo >= tot_len)
974 974 hi = hole_start;
975 975 else
976 976 return (0);
977 977 }
978 978 }
979 979 }
980 980 } else {
981 981 /* lo >= hole_start */
982 982 if (hi < hole_end)
983 983 return (0);
984 984 if (lo < hole_end)
985 985 lo = hole_end;
986 986 }
987 987 #endif
988 988
989 989 if (hi - lo < tot_len)
990 990 return (0);
991 991
992 992 if (align > 1) {
993 993 uintptr_t tlo = lo + redzone;
994 994 uintptr_t thi = hi - redzone;
995 995 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
996 996 if (tlo < lo + redzone) {
997 997 return (0);
998 998 }
999 999 if (thi < tlo || thi - tlo < minlen) {
1000 1000 return (0);
1001 1001 }
1002 1002 }
1003 1003
1004 1004 *basep = (caddr_t)lo;
1005 1005 *lenp = hi - lo;
1006 1006 return (1);
1007 1007 }
1008 1008
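/*
 * Example of the alignment step above (addresses hypothetical): with
 * lo = 0x10000, redzone = 0x1000, align = 0x10000 and off = 0, tlo
 * starts at 0x11000 and P2PHASEUP() rounds it up to 0x20000, the first
 * properly phased base that still leaves room for the leading redzone.
 */
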
1009 1009 /*
1010 1010 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1011 1011 * addresses at least "minlen" long. On success, 1 is returned and *basep
1012 1012 * and *lenp are adjusted to describe the acceptable range. On failure, 0
1013 1013 * is returned.
1014 1014 */
1015 1015 int
1016 1016 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1017 1017 {
1018 1018 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1019 1019 }
1020 1020
1021 1021 /*
1022 1022 * Default to forbidding the first 64k of address space. This protects most
1023 1023 * reasonably sized structures from dereferences through NULL:
1024 1024 * ((foo_t *)0)->bar
1025 1025 */
1026 1026 uintptr_t forbidden_null_mapping_sz = 0x10000;
1027 1027
1028 1028 /*
1029 1029 * Determine whether [addr, addr+len] are valid user addresses.
1030 1030 */
1031 1031 /*ARGSUSED*/
1032 1032 int
1033 1033 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1034 1034 caddr_t userlimit)
1035 1035 {
1036 1036 caddr_t eaddr = addr + len;
1037 1037
1038 1038 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1039 1039 return (RANGE_BADADDR);
1040 1040
1041 1041 if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1042 1042 as->a_proc != NULL &&
1043 1043 secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1044 1044 return (RANGE_BADADDR);
1045 1045
1046 1046 #if defined(__amd64)
1047 1047 /*
1048 1048 * Check for the VA hole
1049 1049 */
1050 1050 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1051 1051 return (RANGE_BADADDR);
1052 1052 #endif
1053 1053
1054 1054 return (RANGE_OKAY);
1055 1055 }
1056 1056
1057 1057 /*
1058 1058 * Return 1 if the page frame is onboard memory, else 0.
1059 1059 */
1060 1060 int
1061 1061 pf_is_memory(pfn_t pf)
1062 1062 {
1063 1063 if (pfn_is_foreign(pf))
1064 1064 return (0);
1065 1065 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * return the memrange containing pfn
1070 1070 */
1071 1071 int
1072 1072 memrange_num(pfn_t pfn)
1073 1073 {
1074 1074 int n;
1075 1075
1076 1076 for (n = 0; n < nranges - 1; ++n) {
1077 1077 if (pfn >= memranges[n])
1078 1078 break;
1079 1079 }
1080 1080 return (n);
1081 1081 }
1082 1082
1083 1083 /*
1084 1084 * return the mnoderange containing pfn
1085 1085 */
1086 1086 /*ARGSUSED*/
1087 1087 int
1088 1088 pfn_2_mtype(pfn_t pfn)
1089 1089 {
1090 1090 #if defined(__xpv)
1091 1091 return (0);
1092 1092 #else
1093 1093 int n;
1094 1094
1095 1095 /* Always start from highest pfn and work our way down */
1096 1096 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1097 1097 if (pfn >= mnoderanges[n].mnr_pfnlo) {
1098 1098 break;
1099 1099 }
1100 1100 }
1101 1101 return (n);
1102 1102 #endif
1103 1103 }
1104 1104
1105 1105 #if !defined(__xpv)
1106 1106 /*
1107 1107 * is_contigpage_free:
1108 1108 * returns a page list of contiguous pages. It minimally has to return
1109 1109 * minctg pages. Caller determines minctg based on the scatter-gather
1110 1110 * list length.
1111 1111 *
1112 1112 * pfnp is set to the next page frame to search on return.
1113 1113 */
1114 1114 static page_t *
1115 1115 is_contigpage_free(
1116 1116 pfn_t *pfnp,
1117 1117 pgcnt_t *pgcnt,
1118 1118 pgcnt_t minctg,
1119 1119 uint64_t pfnseg,
1120 1120 int iolock)
1121 1121 {
1122 1122 int i = 0;
1123 1123 pfn_t pfn = *pfnp;
1124 1124 page_t *pp;
1125 1125 page_t *plist = NULL;
1126 1126
1127 1127 /*
1128 1128 * fail if pfn + minctg crosses a segment boundary.
1129 1129 * Adjust for next starting pfn to begin at segment boundary.
1130 1130 */
1131 1131
1132 1132 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1133 1133 *pfnp = roundup(*pfnp, pfnseg + 1);
1134 1134 return (NULL);
1135 1135 }
1136 1136
1137 1137 do {
1138 1138 retry:
1139 1139 pp = page_numtopp_nolock(pfn + i);
1140 1140 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1141 1141 (page_trylock(pp, SE_EXCL) == 0)) {
1142 1142 (*pfnp)++;
1143 1143 break;
1144 1144 }
1145 1145 if (page_pptonum(pp) != pfn + i) {
1146 1146 page_unlock(pp);
1147 1147 goto retry;
1148 1148 }
1149 1149
1150 1150 if (!(PP_ISFREE(pp))) {
1151 1151 page_unlock(pp);
1152 1152 (*pfnp)++;
1153 1153 break;
1154 1154 }
1155 1155
1156 1156 if (!PP_ISAGED(pp)) {
1157 1157 page_list_sub(pp, PG_CACHE_LIST);
1158 1158 page_hashout(pp, (kmutex_t *)NULL);
1159 1159 } else {
1160 1160 page_list_sub(pp, PG_FREE_LIST);
1161 1161 }
1162 1162
1163 1163 if (iolock)
1164 1164 page_io_lock(pp);
1165 1165 page_list_concat(&plist, &pp);
1166 1166
1167 1167 /*
1168 1168 * exit loop when pgcnt satisfied or segment boundary reached.
1169 1169 */
1170 1170
1171 1171 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1172 1172
1173 1173 *pfnp += i; /* set to next pfn to search */
1174 1174
1175 1175 if (i >= minctg) {
1176 1176 *pgcnt -= i;
1177 1177 return (plist);
1178 1178 }
1179 1179
1180 1180 /*
1181 1181 * failure: minctg not satisfied.
1182 1182 *
1183 1183 * if next request crosses segment boundary, set next pfn
1184 1184 * to search from the segment boundary.
1185 1185 */
1186 1186 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1187 1187 *pfnp = roundup(*pfnp, pfnseg + 1);
1188 1188
1189 1189 /* clean up any pages already allocated */
1190 1190
1191 1191 while (plist) {
1192 1192 pp = plist;
1193 1193 page_sub(&plist, pp);
1194 1194 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1195 1195 if (iolock)
1196 1196 page_io_unlock(pp);
1197 1197 page_unlock(pp);
1198 1198 }
1199 1199
1200 1200 return (NULL);
1201 1201 }
1202 1202 #endif /* !__xpv */
1203 1203
1204 1204 /*
1205 1205 * verify that pages being returned from allocator have correct DMA attribute
1206 1206 */
1207 1207 #ifndef DEBUG
1208 1208 #define check_dma(a, b, c) (void)(0)
1209 1209 #else
1210 1210 static void
1211 1211 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1212 1212 {
1213 1213 if (dma_attr == NULL)
1214 1214 return;
1215 1215
1216 1216 while (cnt-- > 0) {
1217 1217 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1218 1218 dma_attr->dma_attr_addr_lo)
1219 1219 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1220 1220 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1221 1221 dma_attr->dma_attr_addr_hi)
1222 1222 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1223 1223 pp = pp->p_next;
1224 1224 }
1225 1225 }
1226 1226 #endif
1227 1227
1228 1228 #if !defined(__xpv)
1229 1229 static page_t *
1230 1230 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1231 1231 {
1232 1232 pfn_t pfn;
1233 1233 int sgllen;
1234 1234 uint64_t pfnseg;
1235 1235 pgcnt_t minctg;
1236 1236 page_t *pplist = NULL, *plist;
1237 1237 uint64_t lo, hi;
1238 1238 pgcnt_t pfnalign = 0;
1239 1239 static pfn_t startpfn;
1240 1240 static pgcnt_t lastctgcnt;
1241 1241 uintptr_t align;
1242 1242
1243 1243 CONTIG_LOCK();
1244 1244
1245 1245 if (mattr) {
1246 1246 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1247 1247 hi = mmu_btop(mattr->dma_attr_addr_hi);
1248 1248 if (hi >= physmax)
1249 1249 hi = physmax - 1;
1250 1250 sgllen = mattr->dma_attr_sgllen;
1251 1251 pfnseg = mmu_btop(mattr->dma_attr_seg);
1252 1252
1253 1253 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1254 1254 if (align > MMU_PAGESIZE)
1255 1255 pfnalign = mmu_btop(align);
1256 1256
1257 1257 /*
1258 1258 		 * in order to satisfy the request, we must minimally
1259 1259 * acquire minctg contiguous pages
1260 1260 */
1261 1261 minctg = howmany(*pgcnt, sgllen);
1262 1262
1263 1263 ASSERT(hi >= lo);
1264 1264
1265 1265 /*
1266 1266 * start from where last searched if the minctg >= lastctgcnt
1267 1267 		 * start from where we last searched if minctg >= lastctgcnt
1268 1268 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1269 1269 startpfn = lo;
1270 1270 } else {
1271 1271 hi = physmax - 1;
1272 1272 lo = 0;
1273 1273 sgllen = 1;
1274 1274 pfnseg = mmu.highest_pfn;
1275 1275 minctg = *pgcnt;
1276 1276
1277 1277 if (minctg < lastctgcnt)
1278 1278 startpfn = lo;
1279 1279 }
1280 1280 lastctgcnt = minctg;
1281 1281
1282 1282 ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1283 1283
1284 1284 /* conserve 16m memory - start search above 16m when possible */
1285 1285 if (hi > PFN_16M && startpfn < PFN_16M)
1286 1286 startpfn = PFN_16M;
1287 1287
1288 1288 pfn = startpfn;
1289 1289 if (pfnalign)
1290 1290 pfn = P2ROUNDUP(pfn, pfnalign);
1291 1291
1292 1292 while (pfn + minctg - 1 <= hi) {
1293 1293
1294 1294 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1295 1295 if (plist) {
1296 1296 page_list_concat(&pplist, &plist);
1297 1297 sgllen--;
1298 1298 /*
1299 1299 * return when contig pages no longer needed
1300 1300 */
1301 1301 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1302 1302 startpfn = pfn;
1303 1303 CONTIG_UNLOCK();
1304 1304 check_dma(mattr, pplist, *pgcnt);
1305 1305 return (pplist);
1306 1306 }
1307 1307 minctg = howmany(*pgcnt, sgllen);
1308 1308 }
1309 1309 if (pfnalign)
1310 1310 pfn = P2ROUNDUP(pfn, pfnalign);
1311 1311 }
1312 1312
1313 1313 /* cannot find contig pages in specified range */
1314 1314 if (startpfn == lo) {
1315 1315 CONTIG_UNLOCK();
1316 1316 return (NULL);
1317 1317 }
1318 1318
1319 1319 /* did not start with lo previously */
1320 1320 pfn = lo;
1321 1321 if (pfnalign)
1322 1322 pfn = P2ROUNDUP(pfn, pfnalign);
1323 1323
1324 1324 /* allow search to go above startpfn */
1325 1325 while (pfn < startpfn) {
1326 1326
1327 1327 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1328 1328 if (plist != NULL) {
1329 1329
1330 1330 page_list_concat(&pplist, &plist);
1331 1331 sgllen--;
1332 1332
1333 1333 /*
1334 1334 * return when contig pages no longer needed
1335 1335 */
1336 1336 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1337 1337 startpfn = pfn;
1338 1338 CONTIG_UNLOCK();
1339 1339 check_dma(mattr, pplist, *pgcnt);
1340 1340 return (pplist);
1341 1341 }
1342 1342 minctg = howmany(*pgcnt, sgllen);
1343 1343 }
1344 1344 if (pfnalign)
1345 1345 pfn = P2ROUNDUP(pfn, pfnalign);
1346 1346 }
1347 1347 CONTIG_UNLOCK();
1348 1348 return (NULL);
1349 1349 }
1350 1350 #endif /* !__xpv */
1351 1351
1352 1352 /*
1353 1353 * mnode_range_cnt() calculates the number of memory ranges for mnode and
1354 1354 * memranges[]. Used to determine the size of page lists and mnoderanges.
1355 1355 */
1356 1356 int
1357 1357 mnode_range_cnt(int mnode)
1358 1358 {
1359 1359 #if defined(__xpv)
1360 1360 ASSERT(mnode == 0);
1361 1361 return (1);
1362 1362 #else /* __xpv */
1363 1363 int mri;
1364 1364 int mnrcnt = 0;
1365 1365
1366 1366 if (mem_node_config[mnode].exists != 0) {
1367 1367 mri = nranges - 1;
1368 1368
1369 1369 /* find the memranges index below contained in mnode range */
1370 1370
1371 1371 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1372 1372 mri--;
1373 1373
1374 1374 /*
1375 1375 * increment mnode range counter when memranges or mnode
1376 1376 * boundary is reached.
1377 1377 */
1378 1378 while (mri >= 0 &&
1379 1379 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380 1380 mnrcnt++;
1381 1381 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382 1382 mri--;
1383 1383 else
1384 1384 break;
1385 1385 }
1386 1386 }
1387 1387 ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388 1388 return (mnrcnt);
1389 1389 #endif /* __xpv */
1390 1390 }
1391 1391
1392 1392 static int
1393 1393 mnoderange_cmp(const void *v1, const void *v2)
1394 1394 {
1395 1395 const mnoderange_t *m1 = v1;
1396 1396 const mnoderange_t *m2 = v2;
1397 1397
1398 1398 if (m1->mnr_pfnlo < m2->mnr_pfnlo)
1399 1399 return (-1);
1400 1400 return (m1->mnr_pfnlo > m2->mnr_pfnlo);
1401 1401 }
1402 1402
1403 1403 void
1404 1404 mnode_range_setup(mnoderange_t *mnoderanges)
1405 1405 {
1406 1406 mnoderange_t *mp;
1407 1407 size_t nr_ranges;
1408 1408 size_t mnode;
1409 1409
1410 1410 for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
1411 1411 mnode < max_mem_nodes; mnode++) {
1412 1412 size_t mri = nranges - 1;
1413 1413
1414 1414 if (mem_node_config[mnode].exists == 0)
1415 1415 continue;
1416 1416
1417 1417 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1418 1418 mri--;
1419 1419
1420 1420 while (mri >= 0 && mem_node_config[mnode].physmax >=
1421 1421 MEMRANGELO(mri)) {
1422 1422 mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1423 1423 mem_node_config[mnode].physbase);
1424 1424 mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1425 1425 mem_node_config[mnode].physmax);
1426 1426 mp->mnr_mnode = mnode;
1427 1427 mp->mnr_memrange = mri;
1428 1428 mp->mnr_next = -1;
1429 1429 mp->mnr_exists = 1;
1430 1430 mp++;
1431 1431 nr_ranges++;
1432 1432 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1433 1433 mri--;
1434 1434 else
1435 1435 break;
1436 1436 }
1437 1437 }
1438 1438
1439 1439 /*
1440 1440 * mnoderangecnt can be larger than nr_ranges when memory DR is
1441 1441 * supposedly supported.
1442 1442 */
1443 1443 VERIFY3U(nr_ranges, <=, mnoderangecnt);
1444 1444
1445 1445 qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
1446 1446
1447 1447 /*
1448 1448 * If some intrepid soul takes the axe to the memory DR code, we can
1449 1449 * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
1450 1450 *
1451 1451 * The VERIFY3U() above can be "==" then too.
1452 1452 */
1453 1453 for (size_t i = 1; i < nr_ranges; i++)
1454 1454 mnoderanges[i].mnr_next = i - 1;
1455 1455
1456 1456 mtypetop = nr_ranges - 1;
1457 1457 mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
1458 1458 if (physmax4g)
1459 1459 mtype4g = pfn_2_mtype(0xfffff);
1460 1460 }
1461 1461
1462 1462 #ifndef __xpv
1463 1463 /*
1464 1464 * Update mnoderanges for memory hot-add DR operations.
1465 1465 */
1466 1466 static void
1467 1467 mnode_range_add(int mnode)
1468 1468 {
1469 1469 int *prev;
1470 1470 int n, mri;
1471 1471 pfn_t start, end;
1472 1472 extern void membar_sync(void);
1473 1473
1474 1474 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475 1475 ASSERT(mem_node_config[mnode].exists);
1476 1476 start = mem_node_config[mnode].physbase;
1477 1477 end = mem_node_config[mnode].physmax;
1478 1478 ASSERT(start <= end);
1479 1479 mutex_enter(&mnoderange_lock);
1480 1480
1481 1481 #ifdef DEBUG
1482 1482 /* Check whether it interleaves with other memory nodes. */
1483 1483 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1484 1484 ASSERT(mnoderanges[n].mnr_exists);
1485 1485 if (mnoderanges[n].mnr_mnode == mnode)
1486 1486 continue;
1487 1487 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1488 1488 end < mnoderanges[n].mnr_pfnlo);
1489 1489 }
1490 1490 #endif /* DEBUG */
1491 1491
1492 1492 mri = nranges - 1;
1493 1493 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1494 1494 mri--;
1495 1495 while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1496 1496 /* Check whether mtype already exists. */
1497 1497 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1498 1498 if (mnoderanges[n].mnr_mnode == mnode &&
1499 1499 mnoderanges[n].mnr_memrange == mri) {
1500 1500 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1501 1501 start);
1502 1502 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1503 1503 end);
1504 1504 break;
1505 1505 }
1506 1506 }
1507 1507
1508 1508 /* Add a new entry if it doesn't exist yet. */
1509 1509 if (n == -1) {
1510 1510 /* Try to find an unused entry in mnoderanges array. */
1511 1511 for (n = 0; n < mnoderangecnt; n++) {
1512 1512 if (mnoderanges[n].mnr_exists == 0)
1513 1513 break;
1514 1514 }
1515 1515 ASSERT(n < mnoderangecnt);
1516 1516 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1517 1517 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1518 1518 mnoderanges[n].mnr_mnode = mnode;
1519 1519 mnoderanges[n].mnr_memrange = mri;
1520 1520 mnoderanges[n].mnr_exists = 1;
1521 1521 /* Page 0 should always be present. */
1522 1522 for (prev = &mtypetop;
1523 1523 mnoderanges[*prev].mnr_pfnlo > start;
1524 1524 prev = &mnoderanges[*prev].mnr_next) {
1525 1525 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1526 1526 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1527 1527 }
1528 1528 mnoderanges[n].mnr_next = *prev;
1529 1529 membar_sync();
1530 1530 *prev = n;
1531 1531 }
1532 1532
1533 1533 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1534 1534 mri--;
1535 1535 else
1536 1536 break;
1537 1537 }
1538 1538
1539 1539 mutex_exit(&mnoderange_lock);
1540 1540 }
1541 1541
1542 1542 /*
1543 1543 * Update mnoderanges for memory hot-removal DR operations.
1544 1544 */
1545 1545 static void
1546 1546 mnode_range_del(int mnode)
1547 1547 {
1548 1548 _NOTE(ARGUNUSED(mnode));
1549 1549 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1550 1550 /* TODO: support deletion operation. */
1551 1551 ASSERT(0);
1552 1552 }
1553 1553
1554 1554 void
1555 1555 plat_slice_add(pfn_t start, pfn_t end)
1556 1556 {
1557 1557 mem_node_add_slice(start, end);
1558 1558 if (plat_dr_enabled()) {
1559 1559 mnode_range_add(PFN_2_MEM_NODE(start));
1560 1560 }
1561 1561 }
1562 1562
1563 1563 void
1564 1564 plat_slice_del(pfn_t start, pfn_t end)
1565 1565 {
1566 1566 ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1567 1567 ASSERT(plat_dr_enabled());
1568 1568 mnode_range_del(PFN_2_MEM_NODE(start));
1569 1569 mem_node_del_slice(start, end);
1570 1570 }
1571 1571 #endif /* __xpv */
1572 1572
1573 1573 /*ARGSUSED*/
1574 1574 int
1575 1575 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1576 1576 {
1577 1577 int mtype = mtypetop;
1578 1578
1579 1579 #if !defined(__xpv)
1580 1580 #if defined(__i386)
1581 1581 /*
1582 1582 * set the mtype range
1583 1583 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1584 1584 * - for non kmem requests, set range to above 4g if memory below 4g
1585 1585 * runs low.
1586 1586 */
1587 1587 if (restricted_kmemalloc && VN_ISKAS(vp) &&
1588 1588 (caddr_t)(vaddr) >= kernelheap &&
1589 1589 (caddr_t)(vaddr) < ekernelheap) {
1590 1590 ASSERT(physmax4g);
1591 1591 mtype = mtype4g;
1592 1592 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1593 1593 btop(pgsz), *flags)) {
1594 1594 *flags |= PGI_MT_RANGE16M;
1595 1595 } else {
1596 1596 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1597 1597 VM_STAT_COND_ADD((*flags & PG_PANIC),
1598 1598 vmm_vmstats.pgpanicalloc);
1599 1599 *flags |= PGI_MT_RANGE0;
1600 1600 }
1601 1601 return (mtype);
1602 1602 }
1603 1603 #endif /* __i386 */
1604 1604
1605 1605 if (RESTRICT4G_ALLOC) {
1606 1606 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1607 1607 /* here only for > 4g systems */
1608 1608 *flags |= PGI_MT_RANGE4G;
1609 1609 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1610 1610 *flags |= PGI_MT_RANGE16M;
1611 1611 } else {
1612 1612 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1613 1613 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1614 1614 *flags |= PGI_MT_RANGE0;
1615 1615 }
1616 1616 #endif /* !__xpv */
1617 1617 return (mtype);
1618 1618 }
1619 1619
1620 1620
1621 1621 /* mtype init for page_get_replacement_page */
1622 1622 /*ARGSUSED*/
1623 1623 int
1624 -mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1624 +mtype_pgr_init(int *flags, page_t *pp, pgcnt_t pgcnt)
1625 1625 {
1626 1626 int mtype = mtypetop;
1627 1627 #if !defined(__xpv)
1628 1628 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1629 1629 *flags |= PGI_MT_RANGE16M;
1630 1630 } else {
1631 1631 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1632 1632 *flags |= PGI_MT_RANGE0;
1633 1633 }
1634 1634 #endif
1635 1635 return (mtype);
1636 1636 }
1637 1637
1638 1638 /*
1639 1639 * Determine if the mnode range specified in mtype contains memory belonging
1640 1640 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
1641 1641 * the range from high pfn to 0, 16m or 4g.
1642 1642 *
1643 1643  * Return the first mnode range type index found, otherwise -1 if none found.
1644 1644 */
1645 1645 int
1646 1646 mtype_func(int mnode, int mtype, uint_t flags)
1647 1647 {
1648 1648 if (flags & PGI_MT_RANGE) {
1649 1649 int mnr_lim = MRI_0;
1650 1650
1651 1651 if (flags & PGI_MT_NEXT) {
1652 1652 mtype = mnoderanges[mtype].mnr_next;
1653 1653 }
1654 1654 if (flags & PGI_MT_RANGE4G)
1655 1655 mnr_lim = MRI_4G; /* exclude 0-4g range */
1656 1656 else if (flags & PGI_MT_RANGE16M)
1657 1657 mnr_lim = MRI_16M; /* exclude 0-16m range */
1658 1658 while (mtype != -1 &&
1659 1659 mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1660 1660 if (mnoderanges[mtype].mnr_mnode == mnode)
1661 1661 return (mtype);
1662 1662 mtype = mnoderanges[mtype].mnr_next;
1663 1663 }
1664 1664 } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1665 1665 return (mtype);
1666 1666 }
1667 1667 return (-1);
1668 1668 }
1669 1669
1670 1670 /*
1671 1671 * Update the page list max counts with the pfn range specified by the
1672 1672 * input parameters.
1673 1673 */
1674 1674 void
1675 1675 mtype_modify_max(pfn_t startpfn, long cnt)
1676 1676 {
1677 1677 int mtype;
1678 1678 pgcnt_t inc;
1679 1679 spgcnt_t scnt = (spgcnt_t)(cnt);
1680 1680 pgcnt_t acnt = ABS(scnt);
1681 1681 pfn_t endpfn = startpfn + acnt;
1682 1682 pfn_t pfn, lo;
1683 1683
1684 1684 if (!physmax4g)
1685 1685 return;
1686 1686
1687 1687 mtype = mtypetop;
1688 1688 for (pfn = endpfn; pfn > startpfn; ) {
1689 1689 ASSERT(mtype != -1);
1690 1690 lo = mnoderanges[mtype].mnr_pfnlo;
1691 1691 if (pfn > lo) {
1692 1692 if (startpfn >= lo) {
1693 1693 inc = pfn - startpfn;
1694 1694 } else {
1695 1695 inc = pfn - lo;
1696 1696 }
1697 1697 if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1698 1698 if (scnt > 0)
1699 1699 maxmem4g += inc;
1700 1700 else
1701 1701 maxmem4g -= inc;
1702 1702 }
1703 1703 pfn -= inc;
1704 1704 }
1705 1705 mtype = mnoderanges[mtype].mnr_next;
1706 1706 }
1707 1707 }
1708 1708
1709 1709 int
1710 1710 mtype_2_mrange(int mtype)
1711 1711 {
1712 1712 return (mnoderanges[mtype].mnr_memrange);
1713 1713 }
1714 1714
1715 1715 void
1716 1716 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1717 1717 {
1718 1718 _NOTE(ARGUNUSED(mnode));
1719 1719 ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1720 1720 *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1721 1721 *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1722 1722 }
1723 1723
1724 1724 size_t
1725 1725 plcnt_sz(size_t ctrs_sz)
1726 1726 {
1727 1727 #ifdef DEBUG
1728 1728 int szc, colors;
1729 1729
1730 1730 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1731 1731 for (szc = 0; szc < mmu_page_sizes; szc++) {
1732 1732 colors = page_get_pagecolors(szc);
1733 1733 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1734 1734 }
1735 1735 #endif
1736 1736 return (ctrs_sz);
1737 1737 }
1738 1738
1739 1739 caddr_t
1740 1740 plcnt_init(caddr_t addr)
1741 1741 {
1742 1742 #ifdef DEBUG
1743 1743 int mt, szc, colors;
1744 1744
1745 1745 for (mt = 0; mt < mnoderangecnt; mt++) {
1746 1746 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1747 1747 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1748 1748 for (szc = 0; szc < mmu_page_sizes; szc++) {
1749 1749 colors = page_get_pagecolors(szc);
1750 1750 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1751 1751 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1752 1752 (pgcnt_t *)addr;
1753 1753 addr += (sizeof (pgcnt_t) * colors);
1754 1754 }
1755 1755 }
1756 1756 #endif
1757 1757 return (addr);
1758 1758 }
1759 1759
1760 1760 void
1761 1761 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1762 1762 {
1763 1763 _NOTE(ARGUNUSED(pp));
1764 1764 #ifdef DEBUG
1765 1765 int bin = PP_2_BIN(pp);
1766 1766
1767 1767 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1768 1768 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1769 1769 cnt);
1770 1770 #endif
1771 1771 ASSERT(mtype == PP_2_MTYPE(pp));
1772 1772 if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1773 1773 atomic_add_long(&freemem4g, cnt);
1774 1774 if (flags & PG_CACHE_LIST)
1775 1775 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1776 1776 else
1777 1777 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1778 1778 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1779 1779 }
1780 1780
1781 1781 /*
1782 1782 * Returns the free page count for mnode
1783 1783 */
1784 1784 pgcnt_t
1785 1785 mnode_pgcnt(int mnode)
1786 1786 {
1787 1787 int mtype = mtypetop;
1788 1788 int flags = PGI_MT_RANGE0;
1789 1789 pgcnt_t pgcnt = 0;
1790 1790
1791 1791 mtype = mtype_func(mnode, mtype, flags);
1792 1792
1793 1793 while (mtype != -1) {
1794 1794 pgcnt += MTYPE_FREEMEM(mtype);
1795 1795 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1796 1796 }
1797 1797 return (pgcnt);
1798 1798 }
1799 1799
1800 1800 /*
1801 1801 * Initialize page coloring variables based on the l2 cache parameters.
1802 1802 * Calculate and return memory needed for page coloring data structures.
1803 1803 */
1804 1804 size_t
1805 1805 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1806 1806 {
1807 1807 _NOTE(ARGUNUSED(l2_linesz));
1808 1808 size_t colorsz = 0;
1809 1809 int i;
1810 1810 int colors;
1811 1811
1812 1812 #if defined(__xpv)
1813 1813 /*
1814 1814 * Hypervisor domains currently don't have any concept of NUMA.
1815 1815 * Hence we'll act like there is only 1 memrange.
1816 1816 */
1817 1817 i = memrange_num(1);
1818 1818 #else /* !__xpv */
1819 1819 /*
1820 1820 * Reduce the memory range lists if we don't have large amounts
1821 1821 * of memory. This avoids searching known empty free lists.
1822 1822 * To support memory DR operations, we need to keep memory ranges
1823 1823 * for possible memory hot-add operations.
1824 1824 */
1825 1825 if (plat_dr_physmax > physmax)
1826 1826 i = memrange_num(plat_dr_physmax);
1827 1827 else
1828 1828 i = memrange_num(physmax);
1829 1829 #if defined(__i386)
1830 1830 if (i > MRI_4G)
1831 1831 restricted_kmemalloc = 0;
1832 1832 #endif
1833 1833 /* physmax greater than 4g */
1834 1834 if (i == MRI_4G)
1835 1835 physmax4g = 1;
1836 1836 #endif /* !__xpv */
1837 1837 memranges += i;
1838 1838 nranges -= i;
1839 1839
1840 1840 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1841 1841
1842 1842 ASSERT(ISP2(l2_linesz));
1843 1843 ASSERT(l2_sz > MMU_PAGESIZE);
1844 1844
1845 1845 /* l2_assoc is 0 for fully associative l2 cache */
1846 1846 if (l2_assoc)
1847 1847 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1848 1848 else
1849 1849 l2_colors = 1;
1850 1850
1851 1851 ASSERT(ISP2(l2_colors));
1852 1852
1853 1853 /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1854 1854 page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1855 1855
1856 1856 /*
1857 1857 * cpu_page_colors is non-zero when a page color may be spread across
1858 1858 * multiple bins.
1859 1859 */
1860 1860 if (l2_colors < page_colors)
1861 1861 cpu_page_colors = l2_colors;
1862 1862
1863 1863 ASSERT(ISP2(page_colors));
1864 1864
1865 1865 page_colors_mask = page_colors - 1;
1866 1866
1867 1867 ASSERT(ISP2(CPUSETSIZE()));
1868 1868 page_coloring_shift = lowbit(CPUSETSIZE());
1869 1869
1870 1870 /* initialize number of colors per page size */
1871 1871 for (i = 0; i <= mmu.max_page_level; i++) {
1872 1872 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1873 1873 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1874 1874 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1875 1875 hw_page_array[i].hp_colors = (page_colors_mask >>
1876 1876 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1877 1877 + 1;
1878 1878 colorequivszc[i] = 0;
1879 1879 }
1880 1880
1881 1881 /*
1882 1882 * The value of cpu_page_colors determines if additional color bins
1883 1883 * need to be checked for a particular color in the page_get routines.
1884 1884 */
1885 1885 if (cpu_page_colors != 0) {
1886 1886
1887 1887 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1888 1888 ASSERT(a > 0);
1889 1889 ASSERT(a < 16);
1890 1890
1891 1891 for (i = 0; i <= mmu.max_page_level; i++) {
1892 1892 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1893 1893 colorequivszc[i] = 0;
1894 1894 continue;
1895 1895 }
1896 1896 while ((colors >> a) == 0)
1897 1897 a--;
1898 1898 ASSERT(a >= 0);
1899 1899
1900 1900 /* the high 4 bits encode the color equiv mask */
1901 1901 colorequivszc[i] = (a << 4);
1902 1902 }
1903 1903 }
1904 1904
1905 1905 /* factor in colorequiv to check additional 'equivalent' bins. */
1906 1906 if (colorequiv > 1) {
1907 1907
1908 1908 int a = lowbit(colorequiv) - 1;
1909 1909 if (a > 15)
1910 1910 a = 15;
1911 1911
1912 1912 for (i = 0; i <= mmu.max_page_level; i++) {
1913 1913 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1914 1914 continue;
1915 1915 }
1916 1916 while ((colors >> a) == 0)
1917 1917 a--;
1918 1918 if ((a << 4) > colorequivszc[i]) {
1919 1919 colorequivszc[i] = (a << 4);
1920 1920 }
1921 1921 }
1922 1922 }
1923 1923
1924 1924 /* size for mnoderanges */
1925 1925 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1926 1926 mnoderangecnt += mnode_range_cnt(i);
1927 1927 if (plat_dr_support_memory()) {
1928 1928 /*
1929 1929 * Reserve enough space for memory DR operations.
1930 1930 * Two extra mnoderanges for possible fragmentation,
1931 1931 * one for the 2G boundary and the other for the 4G boundary.
1932 1932 * We don't expect a memory board crossing the 16M boundary
1933 1933 * for memory hot-add operations on x86 platforms.
1934 1934 */
1935 1935 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1936 1936 }
1937 1937 colorsz = mnoderangecnt * sizeof (mnoderange_t);
1938 1938
1939 1939 /* size for fpc_mutex and cpc_mutex */
1940 1940 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1941 1941
1942 1942 /* size of page_freelists */
1943 1943 colorsz += mnoderangecnt * sizeof (page_t ***);
1944 1944 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1945 1945
1946 1946 for (i = 0; i < mmu_page_sizes; i++) {
1947 1947 colors = page_get_pagecolors(i);
1948 1948 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1949 1949 }
1950 1950
1951 1951 /* size of page_cachelists */
1952 1952 colorsz += mnoderangecnt * sizeof (page_t **);
1953 1953 colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1954 1954
1955 1955 return (colorsz);
1956 1956 }
1957 1957
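For a concrete feel of the sizing arithmetic above, a minimal sketch using the same formula for a set-associative cache (cache parameters hypothetical; MMU_PAGESIZE assumed 4 KB):

#include <stdio.h>

#define MMU_PAGESIZE	4096
#define PAGE_COLORS_MIN	16
#define MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned int l2_sz = 2 * 1024 * 1024;	/* hypothetical 2MB L2 */
	int l2_assoc = 8;			/* 8-way set associative */
	unsigned int l2_colors, page_colors;

	/* Same computation as page_coloring_init() for l2_assoc != 0. */
	l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));	/* 64 */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);		/* 64 */
	printf("l2_colors=%u page_colors=%u\n", l2_colors, page_colors);
	return (0);
}
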
1958 1958 /*
1959 1959 * Called once at startup to configure page_coloring data structures and
1960 1960 * do the first page_free()/page_freelist_add().
1961 1961 */
1962 1962 void
1963 1963 page_coloring_setup(caddr_t pcmemaddr)
1964 1964 {
1965 1965 int i;
1966 1966 int j;
1967 1967 int k;
1968 1968 caddr_t addr;
1969 1969 int colors;
1970 1970
1971 1971 /*
1972 1972 * do page coloring setup
1973 1973 */
1974 1974 addr = pcmemaddr;
1975 1975
1976 1976 mnoderanges = (mnoderange_t *)addr;
1977 1977 addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 1978
1979 1979 mnode_range_setup(mnoderanges);
1980 1980
1981 1981 for (k = 0; k < NPC_MUTEX; k++) {
1982 1982 fpc_mutex[k] = (kmutex_t *)addr;
1983 1983 addr += (max_mem_nodes * sizeof (kmutex_t));
1984 1984 }
1985 1985 for (k = 0; k < NPC_MUTEX; k++) {
1986 1986 cpc_mutex[k] = (kmutex_t *)addr;
1987 1987 addr += (max_mem_nodes * sizeof (kmutex_t));
1988 1988 }
1989 1989 page_freelists = (page_t ****)addr;
1990 1990 addr += (mnoderangecnt * sizeof (page_t ***));
1991 1991
1992 1992 page_cachelists = (page_t ***)addr;
1993 1993 addr += (mnoderangecnt * sizeof (page_t **));
1994 1994
1995 1995 for (i = 0; i < mnoderangecnt; i++) {
1996 1996 page_freelists[i] = (page_t ***)addr;
1997 1997 addr += (mmu_page_sizes * sizeof (page_t **));
1998 1998
1999 1999 for (j = 0; j < mmu_page_sizes; j++) {
2000 2000 colors = page_get_pagecolors(j);
2001 2001 page_freelists[i][j] = (page_t **)addr;
2002 2002 addr += (colors * sizeof (page_t *));
2003 2003 }
2004 2004 page_cachelists[i] = (page_t **)addr;
2005 2005 addr += (page_colors * sizeof (page_t *));
2006 2006 }
2007 2007 }
2008 2008
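page_coloring_setup() lays every table out in one pre-sized chunk by bumping a cursor, mirroring the sizes summed up in page_coloring_init(). A minimal sketch of that carving idiom (element types and counts hypothetical):

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int nranges = 3, npagesizes = 2, i;
	size_t sz = nranges * sizeof (int *) +
	    nranges * npagesizes * sizeof (int);
	char *base = malloc(sz), *addr = base;
	int **heads;

	/* Carve all the arrays out of one pre-sized allocation. */
	heads = (int **)addr;
	addr += nranges * sizeof (int *);
	for (i = 0; i < nranges; i++) {
		heads[i] = (int *)addr;
		addr += npagesizes * sizeof (int);
	}
	printf("used %zu of %zu bytes\n", (size_t)(addr - base), sz);
	free(base);
	return (0);
}
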
2009 2009 #if defined(__xpv)
2010 2010 /*
2011 2011 * Give back 10% of the io_pool pages to the free list.
2012 2012 * Don't shrink the pool below some absolute minimum.
2013 2013 */
2014 2014 static void
2015 2015 page_io_pool_shrink()
2016 2016 {
2017 2017 int retcnt;
2018 2018 page_t *pp, *pp_first, *pp_last, **curpool;
2019 2019 mfn_t mfn;
2020 2020 int bothpools = 0;
2021 2021
2022 2022 mutex_enter(&io_pool_lock);
2023 2023 io_pool_shrink_attempts++; /* should be a kstat? */
2024 2024 retcnt = io_pool_cnt / 10;
2025 2025 if (io_pool_cnt - retcnt < io_pool_cnt_min)
2026 2026 retcnt = io_pool_cnt - io_pool_cnt_min;
2027 2027 if (retcnt <= 0)
2028 2028 goto done;
2029 2029 io_pool_shrinks++; /* should be a kstat? */
2030 2030 curpool = &io_pool_4g;
2031 2031 domore:
2032 2032 /*
2033 2033 * Loop through, taking pages from the end of the list
2034 2034 * (highest mfns) until the amount to return is reached.
2035 2035 */
2036 2036 for (pp = *curpool; pp && retcnt > 0; ) {
2037 2037 pp_first = pp_last = pp->p_prev;
2038 2038 if (pp_first == *curpool)
2039 2039 break;
2040 2040 retcnt--;
2041 2041 io_pool_cnt--;
2042 2042 page_io_pool_sub(curpool, pp_first, pp_last);
2043 2043 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2044 2044 start_mfn = mfn;
2045 2045 page_free(pp_first, 1);
2046 2046 pp = *curpool;
2047 2047 }
2048 2048 if (retcnt != 0 && !bothpools) {
2049 2049 /*
2050 2050 * If not enough found in less constrained pool try the
2051 2051 * more constrained one.
2052 2052 */
2053 2053 curpool = &io_pool_16m;
2054 2054 bothpools = 1;
2055 2055 goto domore;
2056 2056 }
2057 2057 done:
2058 2058 mutex_exit(&io_pool_lock);
2059 2059 }
2060 2060
2061 2061 #endif /* __xpv */
2062 2062
2063 2063 uint_t
2064 2064 page_create_update_flags_x86(uint_t flags)
2065 2065 {
2066 2066 #if defined(__xpv)
2067 2067 /*
2068 2068 * Check this is an urgent allocation and free pages are depleted.
2069 2069 */
2070 2070 if (!(flags & PG_WAIT) && freemem < desfree)
2071 2071 page_io_pool_shrink();
2072 2072 #else /* !__xpv */
2073 2073 /*
2074 2074 * page_create_get_something may call this because 4g memory may be
2075 2075 * depleted. Set flags to allow for relocation of base page below
2076 2076 * 4g if necessary.
2077 2077 */
2078 2078 if (physmax4g)
2079 2079 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2080 2080 #endif /* __xpv */
2081 2081 return (flags);
2082 2082 }
2083 2083
2084 2084 /*ARGSUSED*/
2085 2085 int
2086 2086 bp_color(struct buf *bp)
2087 2087 {
2088 2088 return (0);
2089 2089 }
2090 2090
2091 2091 #if defined(__xpv)
2092 2092
2093 2093 /*
2094 2094 * Take pages out of an io_pool
2095 2095 */
2096 2096 static void
2097 2097 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2098 2098 {
2099 2099 if (*poolp == pp_first) {
2100 2100 *poolp = pp_last->p_next;
2101 2101 if (*poolp == pp_first)
2102 2102 *poolp = NULL;
2103 2103 }
2104 2104 pp_first->p_prev->p_next = pp_last->p_next;
2105 2105 pp_last->p_next->p_prev = pp_first->p_prev;
2106 2106 pp_first->p_prev = pp_last;
2107 2107 pp_last->p_next = pp_first;
2108 2108 }
2109 2109
2110 2110 /*
2111 2111 * Put a page on the io_pool list. The list is ordered by increasing MFN.
2112 2112 */
2113 2113 static void
2114 2114 page_io_pool_add(page_t **poolp, page_t *pp)
2115 2115 {
2116 2116 page_t *look;
2117 2117 mfn_t mfn = mfn_list[pp->p_pagenum];
2118 2118
2119 2119 if (*poolp == NULL) {
2120 2120 *poolp = pp;
2121 2121 pp->p_next = pp;
2122 2122 pp->p_prev = pp;
2123 2123 return;
2124 2124 }
2125 2125
2126 2126 /*
2127 2127 * Since we try to take pages from the high end of the pool,
2128 2128 * chances are good that the pages to be put on the list will
2129 2129 * go at or near the end of the list. So start at the end and
2130 2130 * work backwards.
2131 2131 */
2132 2132 look = (*poolp)->p_prev;
2133 2133 while (mfn < mfn_list[look->p_pagenum]) {
2134 2134 look = look->p_prev;
2135 2135 if (look == (*poolp)->p_prev)
2136 2136 break; /* backed all the way to front of list */
2137 2137 }
2138 2138
2139 2139 /* insert after look */
2140 2140 pp->p_prev = look;
2141 2141 pp->p_next = look->p_next;
2142 2142 pp->p_next->p_prev = pp;
2143 2143 look->p_next = pp;
2144 2144 if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2145 2145 /*
2146 2146 * We inserted a new first list element;
2147 2147 * adjust the pool pointer to the new element.
2148 2148 */
2149 2149 *poolp = pp;
2150 2150 }
2151 2151 }
2152 2152
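The io_pool lists are circular, doubly linked, and kept in ascending mfn order; page_io_pool_add() walks backwards from the tail because new pages usually belong near the end. A self-contained userland sketch of the same ordered insert (node type and keys hypothetical):

#include <stdio.h>

struct node { struct node *next, *prev; unsigned long key; };

/* Insert np into circular list *headp, keeping ascending key order. */
static void
ordered_insert(struct node **headp, struct node *np)
{
	struct node *look;

	if (*headp == NULL) {
		*headp = np->next = np->prev = np;
		return;
	}
	look = (*headp)->prev;			/* start at the tail */
	while (np->key < look->key) {
		look = look->prev;
		if (look == (*headp)->prev)
			break;			/* wrapped to the front */
	}
	/* insert after look */
	np->prev = look;
	np->next = look->next;
	np->next->prev = np;
	look->next = np;
	if (np->key < (*headp)->key)
		*headp = np;			/* new smallest element */
}

int
main(void)
{
	struct node n[3] = { { 0, 0, 5 }, { 0, 0, 2 }, { 0, 0, 9 } };
	struct node *head = NULL, *p;
	int i;

	for (i = 0; i < 3; i++)
		ordered_insert(&head, &n[i]);
	p = head;
	do {
		printf("%lu ", p->key);
		p = p->next;
	} while (p != head);
	printf("\n");				/* prints: 2 5 9 */
	return (0);
}
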
2153 2153 /*
2154 2154 * Add a page to the io_pool. Setting the force flag will force the page
2155 2155 * into the io_pool no matter what.
2156 2156 */
2157 2157 static void
2158 2158 add_page_to_pool(page_t *pp, int force)
2159 2159 {
2160 2160 page_t *highest;
2161 2161 page_t *freep = NULL;
2162 2162
2163 2163 mutex_enter(&io_pool_lock);
2164 2164 /*
2165 2165 * Always keep the scarce low memory pages
2166 2166 */
2167 2167 if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2168 2168 ++io_pool_cnt;
2169 2169 page_io_pool_add(&io_pool_16m, pp);
2170 2170 goto done;
2171 2171 }
2172 2172 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2173 2173 ++io_pool_cnt;
2174 2174 page_io_pool_add(&io_pool_4g, pp);
2175 2175 } else {
2176 2176 highest = io_pool_4g->p_prev;
2177 2177 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2178 2178 page_io_pool_sub(&io_pool_4g, highest, highest);
2179 2179 page_io_pool_add(&io_pool_4g, pp);
2180 2180 freep = highest;
2181 2181 } else {
2182 2182 freep = pp;
2183 2183 }
2184 2184 }
2185 2185 done:
2186 2186 mutex_exit(&io_pool_lock);
2187 2187 if (freep)
2188 2188 page_free(freep, 1);
2189 2189 }
2190 2190
2191 2191
2192 2192 int contig_pfn_cnt; /* no of pfns in the contig pfn list */
2193 2193 int contig_pfn_max; /* capacity of the contig pfn list */
2194 2194 int next_alloc_pfn; /* next position in list to start a contig search */
2195 2195 int contig_pfnlist_updates; /* pfn list update count */
2196 2196 int contig_pfnlist_builds; /* how many times have we (re)built list */
2197 2197 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2198 2198 int create_contig_pending; /* nonzero means taskq creating contig list */
2199 2199 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */
2200 2200
2201 2201 /*
2202 2202 * Function to use in sorting a list of pfns by their underlying mfns.
2203 2203 */
2204 2204 static int
2205 2205 mfn_compare(const void *pfnp1, const void *pfnp2)
2206 2206 {
2207 2207 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2208 2208 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2209 2209
2210 2210 if (mfn1 > mfn2)
2211 2211 return (1);
2212 2212 if (mfn1 < mfn2)
2213 2213 return (-1);
2214 2214 return (0);
2215 2215 }
2216 2216
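mfn_compare() orders pfn values by indirecting through the pfn-to-mfn table rather than by the pfn values themselves. A standalone sketch of the same comparator driving qsort (stdlib's, in this userland sketch; mfn_list contents hypothetical):

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long pfn_t;
typedef unsigned long mfn_t;

static mfn_t mfn_list[] = { 30, 10, 20, 40 };	/* hypothetical table */

static int
mfn_compare(const void *pfnp1, const void *pfnp2)
{
	mfn_t mfn1 = mfn_list[*(const pfn_t *)pfnp1];
	mfn_t mfn2 = mfn_list[*(const pfn_t *)pfnp2];

	if (mfn1 > mfn2)
		return (1);
	if (mfn1 < mfn2)
		return (-1);
	return (0);
}

int
main(void)
{
	pfn_t pfns[] = { 0, 1, 2, 3 };
	int i;

	qsort(pfns, 4, sizeof (pfn_t), mfn_compare);
	for (i = 0; i < 4; i++)			/* pfns now 1, 2, 0, 3 */
		printf("pfn %lu (mfn %lu)\n", pfns[i], mfn_list[pfns[i]]);
	return (0);
}
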
2217 2217 /*
2218 2218 * Compact the contig_pfn_list by tossing all the non-contiguous
2219 2219 * elements from the list.
2220 2220 */
2221 2221 static void
2222 2222 compact_contig_pfn_list(void)
2223 2223 {
2224 2224 pfn_t pfn, lapfn, prev_lapfn;
2225 2225 mfn_t mfn;
2226 2226 int i, newcnt = 0;
2227 2227
2228 2228 prev_lapfn = 0;
2229 2229 for (i = 0; i < contig_pfn_cnt - 1; i++) {
2230 2230 pfn = contig_pfn_list[i];
2231 2231 lapfn = contig_pfn_list[i + 1];
2232 2232 mfn = mfn_list[pfn];
2233 2233 /*
2234 2234 * See if next pfn is for a contig mfn
2235 2235 */
2236 2236 if (mfn_list[lapfn] != mfn + 1)
2237 2237 continue;
2238 2238 /*
2239 2239 * pfn and lookahead are both put in list
2240 2240 * unless pfn is the previous lookahead.
2241 2241 */
2242 2242 if (pfn != prev_lapfn)
2243 2243 contig_pfn_list[newcnt++] = pfn;
2244 2244 contig_pfn_list[newcnt++] = lapfn;
2245 2245 prev_lapfn = lapfn;
2246 2246 }
2247 2247 for (i = newcnt; i < contig_pfn_cnt; i++)
2248 2248 contig_pfn_list[i] = 0;
2249 2249 contig_pfn_cnt = newcnt;
2250 2250 }
2251 2251
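compact_contig_pfn_list() keeps a pfn only when it participates in a run of adjacent mfns, and the prev_lapfn check stops a run's interior elements from being written twice. A small worked sketch (mfn table hypothetical):

#include <stdio.h>

static unsigned long mfn_of[] = { 10, 11, 13, 14, 15, 20 };	/* hypothetical */

int
main(void)
{
	int list[] = { 0, 1, 2, 3, 4, 5 };	/* indices, sorted by mfn */
	int cnt = 6, newcnt = 0, prev_la = -1, i;

	for (i = 0; i < cnt - 1; i++) {
		int pfn = list[i], lapfn = list[i + 1];

		/* Keep only pairs whose mfns are adjacent. */
		if (mfn_of[lapfn] != mfn_of[pfn] + 1)
			continue;
		if (pfn != prev_la)		/* avoid duplicating pfn */
			list[newcnt++] = pfn;
		list[newcnt++] = lapfn;
		prev_la = lapfn;
	}
	for (i = 0; i < newcnt; i++)
		printf("%d ", list[i]);		/* prints: 0 1 2 3 4 */
	printf("\n");
	return (0);
}
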
2252 2252 /*ARGSUSED*/
2253 2253 static void
2254 2254 call_create_contiglist(void *arg)
2255 2255 {
2256 2256 (void) create_contig_pfnlist(PG_WAIT);
2257 2257 }
2258 2258
2259 2259 /*
2260 2260 * Create list of freelist pfns that have underlying
2261 2261 * contiguous mfns. The list is kept in ascending mfn order.
2262 2262 * Returns 1 if the list was created, else 0.
2263 2263 */
2264 2264 static int
2265 2265 create_contig_pfnlist(uint_t flags)
2266 2266 {
2267 2267 pfn_t pfn;
2268 2268 page_t *pp;
2269 2269 int ret = 1;
2270 2270
2271 2271 mutex_enter(&contig_list_lock);
2272 2272 if (contig_pfn_list != NULL)
2273 2273 goto out;
2274 2274 contig_pfn_max = freemem + (freemem / 10);
2275 2275 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2276 2276 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2277 2277 if (contig_pfn_list == NULL) {
2278 2278 /*
2279 2279 * If we could not create the contig list (because
2280 2280 * we could not sleep for memory), dispatch a taskq
2281 2281 * job that can sleep to get the memory.
2282 2282 */
2283 2283 if (!create_contig_pending) {
2284 2284 if (taskq_dispatch(system_taskq, call_create_contiglist,
2285 2285 NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2286 2286 create_contig_pending = 1;
2287 2287 }
2288 2288 contig_pfnlist_buildfailed++; /* count list build failures */
2289 2289 ret = 0;
2290 2290 goto out;
2291 2291 }
2292 2292 create_contig_pending = 0;
2293 2293 ASSERT(contig_pfn_cnt == 0);
2294 2294 for (pfn = 0; pfn < mfn_count; pfn++) {
2295 2295 pp = page_numtopp_nolock(pfn);
2296 2296 if (pp == NULL || !PP_ISFREE(pp))
2297 2297 continue;
2298 2298 contig_pfn_list[contig_pfn_cnt] = pfn;
2299 2299 if (++contig_pfn_cnt == contig_pfn_max)
2300 2300 break;
2301 2301 }
2302 2302 /*
2303 2303 * Sanity check the new list.
2304 2304 */
2305 2305 if (contig_pfn_cnt < 2) { /* no contig pfns */
2306 2306 contig_pfn_cnt = 0;
2307 2307 contig_pfnlist_buildfailed++;
2308 2308 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2309 2309 contig_pfn_list = NULL;
2310 2310 contig_pfn_max = 0;
2311 2311 ret = 0;
2312 2312 goto out;
2313 2313 }
2314 2314 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2315 2315 compact_contig_pfn_list();
2316 2316 /*
2317 2317 * Make sure next search of the newly created contiguous pfn
2318 2318 * list starts at the beginning of the list.
2319 2319 */
2320 2320 next_alloc_pfn = 0;
2321 2321 contig_pfnlist_builds++; /* count list builds */
2322 2322 out:
2323 2323 mutex_exit(&contig_list_lock);
2324 2324 return (ret);
2325 2325 }
2326 2326
2327 2327
2328 2328 /*
2329 2329 * Toss the current contig pfnlist. Someone is about to do a massive
2330 2330 * update to pfn<->mfn mappings, so we have them destroy the list and
2331 2331 * lock it until they are done with their update.
2332 2332 */
2333 2333 void
2334 2334 clear_and_lock_contig_pfnlist()
2335 2335 {
2336 2336 pfn_t *listp = NULL;
2337 2337 size_t listsize;
2338 2338
2339 2339 mutex_enter(&contig_list_lock);
2340 2340 if (contig_pfn_list != NULL) {
2341 2341 listp = contig_pfn_list;
2342 2342 listsize = contig_pfn_max * sizeof (pfn_t);
2343 2343 contig_pfn_list = NULL;
2344 2344 contig_pfn_max = contig_pfn_cnt = 0;
2345 2345 }
2346 2346 if (listp != NULL)
2347 2347 kmem_free(listp, listsize);
2348 2348 }
2349 2349
2350 2350 /*
2351 2351 * Unlock the contig_pfn_list. The next attempted use of it will cause
2352 2352 * it to be re-created.
2353 2353 */
2354 2354 void
2355 2355 unlock_contig_pfnlist()
2356 2356 {
2357 2357 mutex_exit(&contig_list_lock);
2358 2358 }
2359 2359
2360 2360 /*
2361 2361 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2362 2362 */
2363 2363 void
2364 2364 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2365 2365 {
2366 2366 int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2367 2367 pfn_t probe_pfn;
2368 2368 mfn_t probe_mfn;
2369 2369 int drop_lock = 0;
2370 2370
2371 2371 if (mutex_owner(&contig_list_lock) != curthread) {
2372 2372 drop_lock = 1;
2373 2373 mutex_enter(&contig_list_lock);
2374 2374 }
2375 2375 if (contig_pfn_list == NULL)
2376 2376 goto done;
2377 2377 contig_pfnlist_updates++;
2378 2378 /*
2379 2379 * Find the pfn in the current list. Use a binary chop to locate it.
2380 2380 */
2381 2381 probe_hi = contig_pfn_cnt - 1;
2382 2382 probe_lo = 0;
2383 2383 probe_pos = (probe_hi + probe_lo) / 2;
2384 2384 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2385 2385 if (probe_pos == probe_lo) { /* pfn not in list */
2386 2386 probe_pos = -1;
2387 2387 break;
2388 2388 }
2389 2389 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2390 2390 probe_lo = probe_pos;
2391 2391 else
2392 2392 probe_hi = probe_pos;
2393 2393 probe_pos = (probe_hi + probe_lo) / 2;
2394 2394 }
2395 2395 if (probe_pos >= 0) {
2396 2396 /*
2397 2397 * Remove pfn from list and ensure next alloc
2398 2398 * position stays in bounds.
2399 2399 */
2400 2400 if (--contig_pfn_cnt <= next_alloc_pfn)
2401 2401 next_alloc_pfn = 0;
2402 2402 if (contig_pfn_cnt < 2) { /* no contig pfns */
2403 2403 contig_pfn_cnt = 0;
2404 2404 kmem_free(contig_pfn_list,
2405 2405 contig_pfn_max * sizeof (pfn_t));
2406 2406 contig_pfn_list = NULL;
2407 2407 contig_pfn_max = 0;
2408 2408 goto done;
2409 2409 }
2410 2410 ovbcopy(&contig_pfn_list[probe_pos + 1],
2411 2411 &contig_pfn_list[probe_pos],
2412 2412 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2413 2413 }
2414 2414 if (newmfn == MFN_INVALID)
2415 2415 goto done;
2416 2416 /*
2417 2417 * Check if new mfn has adjacent mfns in the list
2418 2418 */
2419 2419 probe_hi = contig_pfn_cnt - 1;
2420 2420 probe_lo = 0;
2421 2421 insert_after = -2;
2422 2422 do {
2423 2423 probe_pos = (probe_hi + probe_lo) / 2;
2424 2424 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2425 2425 if (newmfn == probe_mfn + 1)
2426 2426 insert_after = probe_pos;
2427 2427 else if (newmfn == probe_mfn - 1)
2428 2428 insert_after = probe_pos - 1;
2429 2429 if (probe_pos == probe_lo)
2430 2430 break;
2431 2431 if (probe_mfn <= newmfn)
2432 2432 probe_lo = probe_pos;
2433 2433 else
2434 2434 probe_hi = probe_pos;
2435 2435 } while (insert_after == -2);
2436 2436 /*
2437 2437 * If there is space in the list and there are adjacent mfns,
2438 2438 * insert the pfn into its proper place in the list.
2439 2439 */
2440 2440 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2441 2441 insert_point = insert_after + 1;
2442 2442 ovbcopy(&contig_pfn_list[insert_point],
2443 2443 &contig_pfn_list[insert_point + 1],
2444 2444 (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2445 2445 contig_pfn_list[insert_point] = pfn;
2446 2446 contig_pfn_cnt++;
2447 2447 }
2448 2448 done:
2449 2449 if (drop_lock)
2450 2450 mutex_exit(&contig_list_lock);
2451 2451 }
2452 2452
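The insertion path in update_contig_pfnlist() binary-searches for an element whose mfn is exactly one below or one above the new mfn. A compact sketch of that probe loop over a hypothetical sorted mfn array:

#include <stdio.h>

int
main(void)
{
	unsigned long mfns[] = { 3, 4, 9, 10, 11, 30 };	/* sorted, hypothetical */
	int hi = 5, lo = 0, pos, insert_after = -2;
	unsigned long newmfn = 12;

	do {
		pos = (hi + lo) / 2;
		if (newmfn == mfns[pos] + 1)		/* adjacent above */
			insert_after = pos;
		else if (newmfn == mfns[pos] - 1)	/* adjacent below */
			insert_after = pos - 1;
		if (pos == lo)
			break;
		if (mfns[pos] <= newmfn)
			lo = pos;
		else
			hi = pos;
	} while (insert_after == -2);
	printf("insert after index %d\n", insert_after);	/* prints 4 */
	return (0);
}
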
2453 2453 /*
2454 2454 * Called to (re-)populate the io_pool from the free page lists.
2455 2455 */
2456 2456 long
2457 2457 populate_io_pool(void)
2458 2458 {
2459 2459 pfn_t pfn;
2460 2460 mfn_t mfn, max_mfn;
2461 2461 page_t *pp;
2462 2462
2463 2463 /*
2464 2464 * Figure out the bounds of the pool on first invocation.
2465 2465 * We use a percentage of memory for the io pool size.
2466 2466 * We allow that to shrink, but not below a fixed minimum.
2467 2467 */
2468 2468 if (io_pool_cnt_max == 0) {
2469 2469 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2470 2470 io_pool_cnt_lowater = io_pool_cnt_max;
2471 2471 /*
2472 2472 * This is the first time in populate_io_pool; grab a va to use
2473 2473 * when we need to allocate pages.
2474 2474 */
2475 2475 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2476 2476 }
2477 2477 /*
2478 2478 * If we are out of pages in the pool, then grow the size of the pool
2479 2479 */
2480 2480 if (io_pool_cnt == 0) {
2481 2481 /*
2482 2482 * Grow the max size of the io pool by 5%, but never more than
2483 2483 * 25% of physical memory.
2484 2484 */
2485 2485 if (io_pool_cnt_max < physmem / 4)
2486 2486 io_pool_cnt_max += io_pool_cnt_max / 20;
2487 2487 }
2488 2488 io_pool_grows++; /* should be a kstat? */
2489 2489
2490 2490 /*
2491 2491 * Get highest mfn on this platform, but limit to the 32-bit DMA max.
2492 2492 */
2493 2493 (void) mfn_to_pfn(start_mfn);
2494 2494 max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2495 2495 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2496 2496 pfn = mfn_to_pfn(mfn);
2497 2497 if (pfn & PFN_IS_FOREIGN_MFN)
2498 2498 continue;
2499 2499 /*
2500 2500 * try to allocate it from free pages
2501 2501 */
2502 2502 pp = page_numtopp_alloc(pfn);
2503 2503 if (pp == NULL)
2504 2504 continue;
2505 2505 PP_CLRFREE(pp);
2506 2506 add_page_to_pool(pp, 1);
2507 2507 if (io_pool_cnt >= io_pool_cnt_max)
2508 2508 break;
2509 2509 }
2510 2510
2511 2511 return (io_pool_cnt);
2512 2512 }
2513 2513
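The pool bounds above come down to simple integer arithmetic; a tiny sketch with a hypothetical physmem and percentage (note that 100 / pct is itself integer division):

#include <stdio.h>

int
main(void)
{
	long physmem = 1048576;			/* pages; hypothetical 4GB */
	int pct = 2;				/* io_pool_physmem_pct */
	long max = physmem / (100 / pct);	/* 2% -> 20971 pages */

	/* Grow by 5%, but never past 25% of physmem. */
	if (max < physmem / 4)
		max += max / 20;		/* -> 22019 pages */
	printf("io_pool_cnt_max = %ld\n", max);
	return (0);
}
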
2514 2514 /*
2515 2515 * Destroy a page that was being used for DMA I/O. It may or
2516 2516 * may not actually go back to the io_pool.
2517 2517 */
2518 2518 void
2519 2519 page_destroy_io(page_t *pp)
2520 2520 {
2521 2521 mfn_t mfn = mfn_list[pp->p_pagenum];
2522 2522
2523 2523 /*
2524 2524 * A reservation was made when the page was alloc'd; release it now.
2525 2525 */
2526 2526 page_unresv(1);
2527 2527 /*
2528 2528 * Unload translations, if any, then hash out the
2529 2529 * page to erase its identity.
2530 2530 */
2531 2531 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2532 2532 page_hashout(pp, NULL);
2533 2533
2534 2534 /*
2535 2535 * If the page is above the 4G DMA limit, just put it back on the free lists.
2536 2536 * DomU pages always go on the free lists as well.
2537 2537 */
2538 2538 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2539 2539 page_free(pp, 1);
2540 2540 return;
2541 2541 }
2542 2542
2543 2543 add_page_to_pool(pp, 0);
2544 2544 }
2545 2545
2546 2546
2547 2547 long contig_searches; /* count of times contig pages requested */
2548 2548 long contig_search_restarts; /* count of contig ranges tried */
2549 2549 long contig_search_failed; /* count of contig alloc failures */
2550 2550
2551 2551 /*
2552 2552 * Free partial page list
2553 2553 */
2554 2554 static void
2555 2555 free_partial_list(page_t **pplist)
2556 2556 {
2557 2557 page_t *pp;
2558 2558
2559 2559 while (*pplist != NULL) {
2560 2560 pp = *pplist;
2561 2561 page_io_pool_sub(pplist, pp, pp);
2562 2562 page_free(pp, 1);
2563 2563 }
2564 2564 }
2565 2565
2566 2566 /*
2567 2567 * Look through the contiguous pfns that are not part of the io_pool for
2568 2568 * contiguous free pages. Return a list of the found pages or NULL.
2569 2569 */
2570 2570 page_t *
2571 2571 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2572 2572 pgcnt_t pfnalign)
2573 2573 {
2574 2574 page_t *pp, *plist = NULL;
2575 2575 mfn_t mfn, prev_mfn, start_mfn;
2576 2576 pfn_t pfn;
2577 2577 int pages_needed, pages_requested;
2578 2578 int search_start;
2579 2579
2580 2580 /*
2581 2581 * create the contig pfn list if not already done
2582 2582 */
2583 2583 retry:
2584 2584 mutex_enter(&contig_list_lock);
2585 2585 if (contig_pfn_list == NULL) {
2586 2586 mutex_exit(&contig_list_lock);
2587 2587 if (!create_contig_pfnlist(flags)) {
2588 2588 return (NULL);
2589 2589 }
2590 2590 goto retry;
2591 2591 }
2592 2592 contig_searches++;
2593 2593 /*
2594 2594 * Search contiguous pfn list for physically contiguous pages not in
2595 2595 * the io_pool. Start the search where the last search left off.
2596 2596 */
2597 2597 pages_requested = pages_needed = npages;
2598 2598 search_start = next_alloc_pfn;
2599 2599 start_mfn = prev_mfn = 0;
2600 2600 while (pages_needed) {
2601 2601 pfn = contig_pfn_list[next_alloc_pfn];
2602 2602 mfn = pfn_to_mfn(pfn);
2603 2603 /*
2604 2604 * Check that the mfn is the first one or contiguous with the
2605 2605 * previous one, that the corresponding page is free, and that
2606 2606 * the mfn range does not cross a segment boundary.
2607 2607 */
2608 2608 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2609 2609 (pp = page_numtopp_alloc(pfn)) != NULL &&
2610 2610 !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2611 2611 PP_CLRFREE(pp);
2612 2612 page_io_pool_add(&plist, pp);
2613 2613 pages_needed--;
2614 2614 if (prev_mfn == 0) {
2615 2615 if (pfnalign &&
2616 2616 mfn != P2ROUNDUP(mfn, pfnalign)) {
2617 2617 /*
2618 2618 * not properly aligned
2619 2619 */
2620 2620 contig_search_restarts++;
2621 2621 free_partial_list(&plist);
2622 2622 pages_needed = pages_requested;
2623 2623 start_mfn = prev_mfn = 0;
2624 2624 goto skip;
2625 2625 }
2626 2626 start_mfn = mfn;
2627 2627 }
2628 2628 prev_mfn = mfn;
2629 2629 } else {
2630 2630 contig_search_restarts++;
2631 2631 free_partial_list(&plist);
2632 2632 pages_needed = pages_requested;
2633 2633 start_mfn = prev_mfn = 0;
2634 2634 }
2635 2635 skip:
2636 2636 if (++next_alloc_pfn == contig_pfn_cnt)
2637 2637 next_alloc_pfn = 0;
2638 2638 if (next_alloc_pfn == search_start)
2639 2639 break; /* all pfns searched */
2640 2640 }
2641 2641 mutex_exit(&contig_list_lock);
2642 2642 if (pages_needed) {
2643 2643 contig_search_failed++;
2644 2644 /*
2645 2645 * Failed to find enough contig pages.
2646 2646 * free partial page list
2647 2647 */
2648 2648 free_partial_list(&plist);
2649 2649 }
2650 2650 return (plist);
2651 2651 }
2652 2652
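The segment-boundary test in find_contig_free() works because the masked low bits of the mfn only decrease when a run wraps past a segment. A minimal sketch with a hypothetical 16-page segment mask:

#include <stdio.h>

int
main(void)
{
	unsigned long pfnseg = 0xf;	/* hypothetical 16-page DMA segment */
	unsigned long start_mfn = 13, mfn;

	for (mfn = start_mfn; mfn < start_mfn + 6; mfn++) {
		/* A run crosses a segment when the masked value wraps. */
		int crosses = (mfn & pfnseg) < (start_mfn & pfnseg);
		printf("mfn %lu %s\n", mfn, crosses ? "crosses" : "ok");
	}
	return (0);	/* mfns 13..15 are ok; 16 and up cross */
}
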
2653 2653 /*
2654 2654 * Search the reserved io pool pages for a page range with the
2655 2655 * desired characteristics.
2656 2656 */
2657 2657 page_t *
2658 2658 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2659 2659 {
2660 2660 page_t *pp_first, *pp_last;
2661 2661 page_t *pp, **poolp;
2662 2662 pgcnt_t nwanted, pfnalign;
2663 2663 uint64_t pfnseg;
2664 2664 mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2665 2665 int align, attempt = 0;
2666 2666
2667 2667 if (minctg == 1)
2668 2668 contig = 0;
2669 2669 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2670 2670 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2671 2671 pfnseg = mmu_btop(mattr->dma_attr_seg);
2672 2672 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2673 2673 if (align > MMU_PAGESIZE)
2674 2674 pfnalign = mmu_btop(align);
2675 2675 else
2676 2676 pfnalign = 0;
2677 2677
2678 2678 try_again:
2679 2679 /*
2680 2680 * See if we want pages for a legacy device
2681 2681 */
2682 2682 if (hi_mfn < PFN_16MEG)
2683 2683 poolp = &io_pool_16m;
2684 2684 else
2685 2685 poolp = &io_pool_4g;
2686 2686 try_smaller:
2687 2687 /*
2688 2688 * Take pages from I/O pool. We'll use pages from the highest
2689 2689 * MFN range possible.
2690 2690 */
2691 2691 pp_first = pp_last = NULL;
2692 2692 mutex_enter(&io_pool_lock);
2693 2693 nwanted = minctg;
2694 2694 for (pp = *poolp; pp && nwanted > 0; ) {
2695 2695 pp = pp->p_prev;
2696 2696
2697 2697 /*
2698 2698 * skip pages above allowable range
2699 2699 */
2700 2700 mfn = mfn_list[pp->p_pagenum];
2701 2701 if (hi_mfn < mfn)
2702 2702 goto skip;
2703 2703
2704 2704 /*
2705 2705 * stop at pages below allowable range
2706 2706 */
2707 2707 if (lo_mfn > mfn)
2708 2708 break;
2709 2709 restart:
2710 2710 if (pp_last == NULL) {
2711 2711 /*
2712 2712 * Check alignment
2713 2713 */
2714 2714 tmfn = mfn - (minctg - 1);
2715 2715 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2716 2716 goto skip; /* not properly aligned */
2717 2717 /*
2718 2718 * Check segment
2719 2719 */
2720 2720 if ((mfn & pfnseg) < (tmfn & pfnseg))
2721 2721 goto skip; /* crosses seg boundary */
2722 2722 /*
2723 2723 * Start building page list
2724 2724 */
2725 2725 pp_first = pp_last = pp;
2726 2726 nwanted--;
2727 2727 } else {
2728 2728 /*
2729 2729 * check physical contiguity if required
2730 2730 */
2731 2731 if (contig &&
2732 2732 mfn_list[pp_first->p_pagenum] != mfn + 1) {
2733 2733 /*
2734 2734 * not a contiguous page, restart list.
2735 2735 */
2736 2736 pp_last = NULL;
2737 2737 nwanted = minctg;
2738 2738 goto restart;
2739 2739 } else { /* add page to list */
2740 2740 pp_first = pp;
2741 2741 nwanted--;
2742 2742 }
2743 2743 }
2744 2744 skip:
2745 2745 if (pp == *poolp)
2746 2746 break;
2747 2747 }
2748 2748
2749 2749 /*
2750 2750 * If we didn't find memory, try the more constrained pool, then
2751 2751 * sweep free pages into the DMA pool and try again.
2752 2752 */
2753 2753 if (nwanted != 0) {
2754 2754 mutex_exit(&io_pool_lock);
2755 2755 /*
2756 2756 * If we were looking in the less constrained pool and
2757 2757 * didn't find pages, try the more constrained pool.
2758 2758 */
2759 2759 if (poolp == &io_pool_4g) {
2760 2760 poolp = &io_pool_16m;
2761 2761 goto try_smaller;
2762 2762 }
2763 2763 kmem_reap();
2764 2764 if (++attempt < 4) {
2765 2765 /*
2766 2766 * Grab some more io_pool pages
2767 2767 */
2768 2768 (void) populate_io_pool();
2769 2769 goto try_again; /* go around and retry */
2770 2770 }
2771 2771 return (NULL);
2772 2772 }
2773 2773 /*
2774 2774 * Found the pages, now snip them from the list
2775 2775 */
2776 2776 page_io_pool_sub(poolp, pp_first, pp_last);
2777 2777 io_pool_cnt -= minctg;
2778 2778 /*
2779 2779 * reset low water mark
2780 2780 */
2781 2781 if (io_pool_cnt < io_pool_cnt_lowater)
2782 2782 io_pool_cnt_lowater = io_pool_cnt;
2783 2783 mutex_exit(&io_pool_lock);
2784 2784 return (pp_first);
2785 2785 }
2786 2786
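The alignment probe in page_io_pool_alloc() checks whether the would-be start of the run, tmfn, is already aligned. A minimal sketch using the P2ROUNDUP() definition from sys/sysmacros.h (values hypothetical):

#include <stdio.h>

#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* align: power of 2 */

int
main(void)
{
	unsigned long mfn = 0x27, minctg = 4, pfnalign = 8;
	unsigned long tmfn = mfn - (minctg - 1);	/* start of the run */

	/* 0x24 rounds up to 0x28, so this run is rejected. */
	printf("tmfn 0x%lx aligned? %s\n", tmfn,
	    tmfn == P2ROUNDUP(tmfn, pfnalign) ? "yes" : "no");
	return (0);
}
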
2787 2787 page_t *
2788 2788 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2789 2789 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2790 2790 {
2791 2791 uint_t kflags;
2792 2792 int order, extra, extpages, i, contig, nbits, extents;
2793 2793 page_t *pp, *expp, *pp_first, **pplist = NULL;
2794 2794 mfn_t *mfnlist = NULL;
2795 2795
2796 2796 contig = flags & PG_PHYSCONTIG;
2797 2797 if (minctg == 1)
2798 2798 contig = 0;
2799 2799 flags &= ~PG_PHYSCONTIG;
2800 2800 kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2801 2801 /*
2802 2802 * The hypervisor will allocate extents; if we want contig
2803 2803 * pages, the extent must be >= minctg.
2804 2804 */
2805 2805 if (contig) {
2806 2806 order = highbit(minctg) - 1;
2807 2807 if (minctg & ((1 << order) - 1))
2808 2808 order++;
2809 2809 extpages = 1 << order;
2810 2810 } else {
2811 2811 order = 0;
2812 2812 extpages = minctg;
2813 2813 }
2814 2814 if (extpages > minctg) {
2815 2815 extra = extpages - minctg;
2816 2816 if (!page_resv(extra, kflags))
2817 2817 return (NULL);
2818 2818 }
2819 2819 pp_first = NULL;
2820 2820 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2821 2821 if (pplist == NULL)
2822 2822 goto balloon_fail;
2823 2823 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2824 2824 if (mfnlist == NULL)
2825 2825 goto balloon_fail;
2826 2826 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2827 2827 if (pp == NULL)
2828 2828 goto balloon_fail;
2829 2829 pp_first = pp;
2830 2830 if (extpages > minctg) {
2831 2831 /*
2832 2832 * fill out the rest of the extent pages to swap
2833 2833 * with the hypervisor
2834 2834 */
2835 2835 for (i = 0; i < extra; i++) {
2836 2836 expp = page_create_va(vp,
2837 2837 (u_offset_t)(uintptr_t)io_pool_kva,
2838 2838 PAGESIZE, flags, &kvseg, io_pool_kva);
2839 2839 if (expp == NULL)
2840 2840 goto balloon_fail;
2841 2841 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2842 2842 page_io_unlock(expp);
2843 2843 page_hashout(expp, NULL);
2844 2844 page_io_lock(expp);
2845 2845 /*
2846 2846 * add page to end of list
2847 2847 */
2848 2848 expp->p_prev = pp_first->p_prev;
2849 2849 expp->p_next = pp_first;
2850 2850 expp->p_prev->p_next = expp;
2851 2851 pp_first->p_prev = expp;
2852 2852 }
2853 2853
2854 2854 }
2855 2855 for (i = 0; i < extpages; i++) {
2856 2856 pplist[i] = pp;
2857 2857 pp = pp->p_next;
2858 2858 }
2859 2859 nbits = highbit(mattr->dma_attr_addr_hi);
2860 2860 extents = contig ? 1 : minctg;
2861 2861 if (balloon_replace_pages(extents, pplist, nbits, order,
2862 2862 mfnlist) != extents) {
2863 2863 if (ioalloc_dbg)
2864 2864 cmn_err(CE_NOTE, "request to hypervisor"
2865 2865 " for %d pages, maxaddr %" PRIx64 " failed",
2866 2866 extpages, mattr->dma_attr_addr_hi);
2867 2867 goto balloon_fail;
2868 2868 }
2869 2869
2870 2870 kmem_free(pplist, extpages * sizeof (page_t *));
2871 2871 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2872 2872 /*
2873 2873 * Return any excess pages to free list
2874 2874 */
2875 2875 if (extpages > minctg) {
2876 2876 for (i = 0; i < extra; i++) {
2877 2877 pp = pp_first->p_prev;
2878 2878 page_sub(&pp_first, pp);
2879 2879 page_io_unlock(pp);
2880 2880 page_unresv(1);
2881 2881 page_free(pp, 1);
2882 2882 }
2883 2883 }
2884 2884 return (pp_first);
2885 2885 balloon_fail:
2886 2886 /*
2887 2887 * Return pages to free list and return failure
2888 2888 */
2889 2889 while (pp_first != NULL) {
2890 2890 pp = pp_first;
2891 2891 page_sub(&pp_first, pp);
2892 2892 page_io_unlock(pp);
2893 2893 if (pp->p_vnode != NULL)
2894 2894 page_hashout(pp, NULL);
2895 2895 page_free(pp, 1);
2896 2896 }
2897 2897 if (pplist)
2898 2898 kmem_free(pplist, extpages * sizeof (page_t *));
2899 2899 if (mfnlist)
2900 2900 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2901 2901 page_unresv(extpages - minctg);
2902 2902 return (NULL);
2903 2903 }
2904 2904
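The order computation in page_swap_with_hypervisor() rounds minctg up to a power of two, since the hypervisor hands out extents of 2^order pages. A minimal sketch (highbit() is reimplemented here; the illumos routine likewise returns the 1-based index of the highest set bit):

#include <stdio.h>

/* 1-based index of the highest set bit, as illumos highbit() behaves. */
static int
highbit(unsigned long v)
{
	int b = 0;

	while (v != 0) {
		b++;
		v >>= 1;
	}
	return (b);
}

int
main(void)
{
	unsigned long minctg = 5;
	int order = highbit(minctg) - 1;

	if (minctg & ((1UL << order) - 1))
		order++;	/* not a power of two; round up */
	printf("minctg %lu -> order %d (%lu pages)\n",
	    minctg, order, 1UL << order);	/* 5 -> order 3 (8 pages) */
	return (0);
}
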
2905 2905 static void
2906 2906 return_partial_alloc(page_t *plist)
2907 2907 {
2908 2908 page_t *pp;
2909 2909
2910 2910 while (plist != NULL) {
2911 2911 pp = plist;
2912 2912 page_sub(&plist, pp);
2913 2913 page_io_unlock(pp);
2914 2914 page_destroy_io(pp);
2915 2915 }
2916 2916 }
2917 2917
2918 2918 static page_t *
2919 2919 page_get_contigpages(
2920 2920 struct vnode *vp,
2921 2921 u_offset_t off,
2922 2922 int *npagesp,
2923 2923 uint_t flags,
2924 2924 caddr_t vaddr,
2925 2925 ddi_dma_attr_t *mattr)
2926 2926 {
2927 2927 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2928 2928 page_t *plist; /* list to return */
2929 2929 page_t *pp, *mcpl;
2930 2930 int contig, anyaddr, npages, getone = 0;
2931 2931 mfn_t lo_mfn;
2932 2932 mfn_t hi_mfn;
2933 2933 pgcnt_t pfnalign = 0;
2934 2934 int align, sgllen;
2935 2935 uint64_t pfnseg;
2936 2936 pgcnt_t minctg;
2937 2937
2938 2938 npages = *npagesp;
2939 2939 ASSERT(mattr != NULL);
2940 2940 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2941 2941 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2942 2942 sgllen = mattr->dma_attr_sgllen;
2943 2943 pfnseg = mmu_btop(mattr->dma_attr_seg);
2944 2944 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2945 2945 if (align > MMU_PAGESIZE)
2946 2946 pfnalign = mmu_btop(align);
2947 2947
2948 2948 contig = flags & PG_PHYSCONTIG;
2949 2949 if (npages == -1) {
2950 2950 npages = 1;
2951 2951 pfnalign = 0;
2952 2952 }
2953 2953 /*
2954 2954 * Clear the contig flag if only one page is needed.
2955 2955 */
2956 2956 if (npages == 1) {
2957 2957 getone = 1;
2958 2958 contig = 0;
2959 2959 }
2960 2960
2961 2961 /*
2962 2962 * Check if any page in the system is fine.
2963 2963 */
2964 2964 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2965 2965 if (!contig && anyaddr && !pfnalign) {
2966 2966 flags &= ~PG_PHYSCONTIG;
2967 2967 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2968 2968 flags, &kvseg, vaddr);
2969 2969 if (plist != NULL) {
2970 2970 *npagesp = 0;
2971 2971 return (plist);
2972 2972 }
2973 2973 }
2974 2974 plist = NULL;
2975 2975 minctg = howmany(npages, sgllen);
2976 2976 while (npages > sgllen || getone) {
2977 2977 if (minctg > npages)
2978 2978 minctg = npages;
2979 2979 mcpl = NULL;
2980 2980 /*
2981 2981 * We may want contig pages with no address range limits.
2982 2982 */
2983 2983 if (anyaddr && contig) {
2984 2984 /*
2985 2985 * Look for free contig pages to satisfy the request.
2986 2986 */
2987 2987 mcpl = find_contig_free(minctg, flags, pfnseg,
2988 2988 pfnalign);
2989 2989 }
2990 2990 /*
2991 2991 * Try the reserved io pools next
2992 2992 */
2993 2993 if (mcpl == NULL)
2994 2994 mcpl = page_io_pool_alloc(mattr, contig, minctg);
2995 2995 if (mcpl != NULL) {
2996 2996 pp = mcpl;
2997 2997 do {
2998 2998 if (!page_hashin(pp, vp, off, NULL)) {
2999 2999 panic("page_get_contigpages:"
3000 3000 " hashin failed"
3001 3001 " pp %p, vp %p, off %llx",
3002 3002 (void *)pp, (void *)vp, off);
3003 3003 }
3004 3004 off += MMU_PAGESIZE;
3005 3005 PP_CLRFREE(pp);
3006 3006 PP_CLRAGED(pp);
3007 3007 page_set_props(pp, P_REF);
3008 3008 page_io_lock(pp);
3009 3009 pp = pp->p_next;
3010 3010 } while (pp != mcpl);
3011 3011 } else {
3012 3012 /*
3013 3013 * Hypervisor exchange doesn't handle segment or
3014 3014 * alignment constraints
3015 3015 */
3016 3016 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3017 3017 pfnalign)
3018 3018 goto fail;
3019 3019 /*
3020 3020 * Try exchanging pages with the hypervisor
3021 3021 */
3022 3022 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3023 3023 flags, minctg);
3024 3024 if (mcpl == NULL)
3025 3025 goto fail;
3026 3026 off += minctg * MMU_PAGESIZE;
3027 3027 }
3028 3028 check_dma(mattr, mcpl, minctg);
3029 3029 /*
3030 3030 * Here with a minctg run of contiguous pages, add them to the
3031 3031 * list we will return for this request.
3032 3032 */
3033 3033 page_list_concat(&plist, &mcpl);
3034 3034 npages -= minctg;
3035 3035 *npagesp = npages;
3036 3036 sgllen--;
3037 3037 if (getone)
3038 3038 break;
3039 3039 }
3040 3040 return (plist);
3041 3041 fail:
3042 3042 return_partial_alloc(plist);
3043 3043 return (NULL);
3044 3044 }
3045 3045
3046 3046 /*
3047 3047 * Allocator for domain 0 I/O pages. We match the required
3048 3048 * DMA attributes and contiguity constraints.
3049 3049 */
3050 3050 /*ARGSUSED*/
3051 3051 page_t *
3052 3052 page_create_io(
3053 3053 struct vnode *vp,
3054 3054 u_offset_t off,
3055 3055 uint_t bytes,
3056 3056 uint_t flags,
3057 3057 struct as *as,
3058 3058 caddr_t vaddr,
3059 3059 ddi_dma_attr_t *mattr)
3060 3060 {
3061 3061 page_t *plist = NULL, *pp;
3062 3062 int npages = 0, contig, anyaddr, pages_req;
3063 3063 mfn_t lo_mfn;
3064 3064 mfn_t hi_mfn;
3065 3065 pgcnt_t pfnalign = 0;
3066 3066 int align;
3067 3067 int is_domu = 0;
3068 3068 int dummy, bytes_got;
3069 3069 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3070 3070
3071 3071 ASSERT(mattr != NULL);
3072 3072 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3073 3073 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3074 3074 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3075 3075 if (align > MMU_PAGESIZE)
3076 3076 pfnalign = mmu_btop(align);
3077 3077
3078 3078 /*
3079 3079 * Clear the contig flag if only one page is needed or the scatter
3080 3080 * gather list length is >= npages.
3081 3081 */
3082 3082 pages_req = npages = mmu_btopr(bytes);
3083 3083 contig = (flags & PG_PHYSCONTIG);
3084 3084 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3085 3085 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3086 3086 contig = 0;
3087 3087
3088 3088 /*
3089 3089 * Check if any old page in the system is fine.
3090 3090 * DomU should always go down this path.
3091 3091 */
3092 3092 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3093 3093 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3094 3094 if ((!contig && anyaddr) || is_domu) {
3095 3095 flags &= ~PG_PHYSCONTIG;
3096 3096 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3097 3097 if (plist != NULL)
3098 3098 return (plist);
3099 3099 else if (is_domu)
3100 3100 return (NULL); /* no memory available */
3101 3101 }
3102 3102 /*
3103 3103 * DomU should never reach here
3104 3104 */
3105 3105 if (contig) {
3106 3106 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3107 3107 mattr);
3108 3108 if (plist == NULL)
3109 3109 goto fail;
3110 3110 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3111 3111 vaddr += bytes_got;
3112 3112 off += bytes_got;
3113 3113 /*
3114 3114 * We now have all the contiguous pages we need, but
3115 3115 * we may still need additional non-contiguous pages.
3116 3116 */
3117 3117 }
3118 3118 /*
3119 3119 * Now loop collecting the requested number of pages. These do
3120 3120 * not have to be contiguous pages, but we use the contig
3121 3121 * page alloc code to get them since it will honor any
3122 3122 * other constraints the pages may have.
3123 3123 */
3124 3124 while (npages--) {
3125 3125 dummy = -1;
3126 3126 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3127 3127 if (pp == NULL)
3128 3128 goto fail;
3129 3129 page_add(&plist, pp);
3130 3130 vaddr += MMU_PAGESIZE;
3131 3131 off += MMU_PAGESIZE;
3132 3132 }
3133 3133 return (plist);
3134 3134 fail:
3135 3135 /*
3136 3136 * Failed to get enough pages, return ones we did get
3137 3137 */
3138 3138 return_partial_alloc(plist);
3139 3139 return (NULL);
3140 3140 }
3141 3141
3142 3142 /*
3143 3143 * Lock and return the page with the highest mfn that we can find. last_mfn
3144 3144 * holds the last one found, so the next search can start from there. We
3145 3145 * also keep a counter so that we don't loop forever if the machine has no
3146 3146 * free pages.
3147 3147 *
3148 3148 * This is called from the balloon thread to find pages to give away. new_high
3149 3149 * is used when new mfns have been added to the system; we will reset our
3150 3150 * search if the new mfns are higher than our current search position.
3151 3151 */
3152 3152 page_t *
3153 3153 page_get_high_mfn(mfn_t new_high)
3154 3154 {
3155 3155 static mfn_t last_mfn = 0;
3156 3156 pfn_t pfn;
3157 3157 page_t *pp;
3158 3158 ulong_t loop_count = 0;
3159 3159
3160 3160 if (new_high > last_mfn)
3161 3161 last_mfn = new_high;
3162 3162
3163 3163 for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3164 3164 if (last_mfn == 0) {
3165 3165 last_mfn = cached_max_mfn;
3166 3166 }
3167 3167
3168 3168 pfn = mfn_to_pfn(last_mfn);
3169 3169 if (pfn & PFN_IS_FOREIGN_MFN)
3170 3170 continue;
3171 3171
3172 3172 /* See if the page is free. If so, lock it. */
3173 3173 pp = page_numtopp_alloc(pfn);
3174 3174 if (pp == NULL)
3175 3175 continue;
3176 3176 PP_CLRFREE(pp);
3177 3177
3178 3178 ASSERT(PAGE_EXCL(pp));
3179 3179 ASSERT(pp->p_vnode == NULL);
3180 3180 ASSERT(!hat_page_is_mapped(pp));
3181 3181 last_mfn--;
3182 3182 return (pp);
3183 3183 }
3184 3184 return (NULL);
3185 3185 }
3186 3186
3187 3187 #else /* !__xpv */
3188 3188
3189 3189 /*
3190 3190 * get a page from any list with the given mnode
3191 3191 */
3192 3192 static page_t *
3193 3193 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3194 3194 int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3195 3195 {
3196 3196 kmutex_t *pcm;
3197 3197 int i;
3198 3198 page_t *pp;
3199 3199 page_t *first_pp;
3200 3200 uint64_t pgaddr;
3201 3201 ulong_t bin;
3202 3202 int mtypestart;
3203 3203 int plw_initialized;
3204 3204 page_list_walker_t plw;
3205 3205
3206 3206 VM_STAT_ADD(pga_vmstats.pgma_alloc);
3207 3207
3208 3208 ASSERT((flags & PG_MATCH_COLOR) == 0);
3209 3209 ASSERT(szc == 0);
3210 3210 ASSERT(dma_attr != NULL);
3211 3211
3212 3212 MTYPE_START(mnode, mtype, flags);
3213 3213 if (mtype < 0) {
3214 3214 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3215 3215 return (NULL);
3216 3216 }
3217 3217
3218 3218 mtypestart = mtype;
3219 3219
3220 3220 bin = origbin;
3221 3221
3222 3222 /*
3223 3223 * check up to page_colors + 1 bins - origbin may be checked twice
3224 3224 * because of BIN_STEP skip
3225 3225 */
3226 3226 do {
3227 3227 plw_initialized = 0;
3228 3228
3229 3229 for (plw.plw_count = 0;
3230 3230 plw.plw_count < page_colors; plw.plw_count++) {
3231 3231
3232 3232 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3233 3233 goto nextfreebin;
3234 3234
3235 3235 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3236 3236 mutex_enter(pcm);
3237 3237 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3238 3238 first_pp = pp;
3239 3239 while (pp != NULL) {
3240 3240 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3241 3241 SE_EXCL) == 0) {
3242 3242 pp = pp->p_next;
3243 3243 if (pp == first_pp) {
3244 3244 pp = NULL;
3245 3245 }
3246 3246 continue;
3247 3247 }
3248 3248
3249 3249 ASSERT(PP_ISFREE(pp));
3250 3250 ASSERT(PP_ISAGED(pp));
3251 3251 ASSERT(pp->p_vnode == NULL);
3252 3252 ASSERT(pp->p_hash == NULL);
3253 3253 ASSERT(pp->p_offset == (u_offset_t)-1);
3254 3254 ASSERT(pp->p_szc == szc);
3255 3255 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3256 3256 /* check if page within DMA attributes */
3257 3257 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3258 3258 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3259 3259 (pgaddr + MMU_PAGESIZE - 1 <=
3260 3260 dma_attr->dma_attr_addr_hi)) {
3261 3261 break;
3262 3262 }
3263 3263
3264 3264 /* continue looking */
3265 3265 page_unlock(pp);
3266 3266 pp = pp->p_next;
3267 3267 if (pp == first_pp)
3268 3268 pp = NULL;
3269 3269
3270 3270 }
3271 3271 if (pp != NULL) {
3272 3272 ASSERT(mtype == PP_2_MTYPE(pp));
3273 3273 ASSERT(pp->p_szc == 0);
3274 3274
3275 3275 /* found a page with specified DMA attributes */
3276 3276 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3277 3277 mtype), pp);
3278 3278 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3279 3279
3280 3280 if ((PP_ISFREE(pp) == 0) ||
3281 3281 (PP_ISAGED(pp) == 0)) {
3282 3282 cmn_err(CE_PANIC, "page %p is not free",
3283 3283 (void *)pp);
3284 3284 }
3285 3285
3286 3286 mutex_exit(pcm);
3287 3287 check_dma(dma_attr, pp, 1);
3288 3288 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3289 3289 return (pp);
3290 3290 }
3291 3291 mutex_exit(pcm);
3292 3292 nextfreebin:
3293 3293 if (plw_initialized == 0) {
3294 3294 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3295 3295 ASSERT(plw.plw_ceq_dif == page_colors);
3296 3296 plw_initialized = 1;
3297 3297 }
3298 3298
3299 3299 if (plw.plw_do_split) {
3300 3300 pp = page_freelist_split(szc, bin, mnode,
3301 3301 mtype,
3302 3302 mmu_btop(dma_attr->dma_attr_addr_lo),
3303 3303 mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3304 3304 &plw);
3305 3305 if (pp != NULL) {
3306 3306 check_dma(dma_attr, pp, 1);
3307 3307 return (pp);
3308 3308 }
3309 3309 }
3310 3310
3311 3311 bin = page_list_walk_next_bin(szc, bin, &plw);
3312 3312 }
3313 3313
3314 3314 MTYPE_NEXT(mnode, mtype, flags);
3315 3315 } while (mtype >= 0);
3316 3316
3317 3317 /* failed to find a page in the freelist; try it in the cachelist */
3318 3318
3319 3319 /* reset mtype start for cachelist search */
3320 3320 mtype = mtypestart;
3321 3321 ASSERT(mtype >= 0);
3322 3322
3323 3323 /* start with the bin of matching color */
3324 3324 bin = origbin;
3325 3325
3326 3326 do {
3327 3327 for (i = 0; i <= page_colors; i++) {
3328 3328 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3329 3329 goto nextcachebin;
3330 3330 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3331 3331 mutex_enter(pcm);
3332 3332 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3333 3333 first_pp = pp;
3334 3334 while (pp != NULL) {
3335 3335 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3336 3336 SE_EXCL) == 0) {
3337 3337 pp = pp->p_next;
3338 3338 if (pp == first_pp)
3339 3339 pp = NULL;
3340 3340 continue;
3341 3341 }
3342 3342 ASSERT(pp->p_vnode);
3343 3343 ASSERT(PP_ISAGED(pp) == 0);
3344 3344 ASSERT(pp->p_szc == 0);
3345 3345 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3346 3346
3347 3347 /* check if page within DMA attributes */
3348 3348
3349 3349 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3350 3350 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3351 3351 (pgaddr + MMU_PAGESIZE - 1 <=
3352 3352 dma_attr->dma_attr_addr_hi)) {
3353 3353 break;
3354 3354 }
3355 3355
3356 3356 /* continue looking */
3357 3357 page_unlock(pp);
3358 3358 pp = pp->p_next;
3359 3359 if (pp == first_pp)
3360 3360 pp = NULL;
3361 3361 }
3362 3362
3363 3363 if (pp != NULL) {
3364 3364 ASSERT(mtype == PP_2_MTYPE(pp));
3365 3365 ASSERT(pp->p_szc == 0);
3366 3366
3367 3367 /* found a page with specified DMA attributes */
3368 3368 page_sub(&PAGE_CACHELISTS(mnode, bin,
3369 3369 mtype), pp);
3370 3370 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3371 3371
3372 3372 mutex_exit(pcm);
3373 3373 ASSERT(pp->p_vnode);
3374 3374 ASSERT(PP_ISAGED(pp) == 0);
3375 3375 check_dma(dma_attr, pp, 1);
3376 3376 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3377 3377 return (pp);
3378 3378 }
3379 3379 mutex_exit(pcm);
3380 3380 nextcachebin:
3381 3381 bin += (i == 0) ? BIN_STEP : 1;
3382 3382 bin &= page_colors_mask;
3383 3383 }
3384 3384 MTYPE_NEXT(mnode, mtype, flags);
3385 3385 } while (mtype >= 0);
3386 3386
3387 3387 VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3388 3388 return (NULL);
3389 3389 }
3390 3390
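The range test in page_get_mnode_anylist() must cover the whole page, so the last byte (pgaddr + MMU_PAGESIZE - 1) is checked against the high bound as well as the first. A minimal sketch with hypothetical attribute limits:

#include <stdio.h>

#define MMU_PAGESIZE	4096ULL

int
main(void)
{
	unsigned long long lo = 0x1000000, hi = 0x1ffffff; /* hypothetical */
	unsigned long long pgaddr = 0x1fff000;	/* last full page in range */

	/* Both the first and the last byte of the page must fit. */
	if (pgaddr >= lo && pgaddr + MMU_PAGESIZE - 1 <= hi)
		printf("page at %#llx satisfies the DMA range\n", pgaddr);
	return (0);
}
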
3391 3391 /*
3392 3392 * This function is similar to page_get_freelist()/page_get_cachelist()
3393 3393 * but it searches both the lists to find a page with the specified
3394 3394 * color (or no color) and DMA attributes. The search is done in the
3395 3395 * freelist first and then in the cache list within the highest memory
3396 3396 * range (based on DMA attributes) before searching in the lower
3397 3397 * memory ranges.
3398 3398 *
3399 3399 * Note: This function is called only by page_create_io().
3400 3400 */
3401 3401 /*ARGSUSED*/
3402 3402 static page_t *
3403 3403 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3404 3404 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3405 3405 {
3406 3406 uint_t bin;
3407 3407 int mtype;
3408 3408 page_t *pp;
3409 3409 int n;
3410 3410 int m;
3411 3411 int szc;
3412 3412 int fullrange;
3413 3413 int mnode;
3414 3414 int local_failed_stat = 0;
3415 3415 lgrp_mnode_cookie_t lgrp_cookie;
3416 3416
3417 3417 VM_STAT_ADD(pga_vmstats.pga_alloc);
3418 3418
3419 3419 /* only base pagesize currently supported */
3420 3420 if (size != MMU_PAGESIZE)
3421 3421 return (NULL);
3422 3422
3423 3423 /*
3424 3424 * If we're passed a specific lgroup, we use it. Otherwise,
3425 3425 * assume first-touch placement is desired.
3426 3426 */
3427 3427 if (!LGRP_EXISTS(lgrp))
3428 3428 lgrp = lgrp_home_lgrp();
3429 3429
3430 3430 /* LINTED */
3431 3431 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3432 3432
3433 3433 /*
3434 3434 * Only hold one freelist or cachelist lock at a time; that way we
3435 3435 * can start anywhere and not have to worry about lock
3436 3436 * ordering.
3437 3437 */
3438 3438 if (dma_attr == NULL) {
3439 3439 n = mtype16m;
3440 3440 m = mtypetop;
3441 3441 fullrange = 1;
3442 3442 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3443 3443 } else {
3444 3444 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3445 3445 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3446 3446
3447 3447 /*
3448 3448 * We can guarantee alignment only to a page boundary.
3449 3449 */
3450 3450 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3451 3451 return (NULL);
3452 3452
3453 3453 /* Sanity check the dma_attr */
3454 3454 if (pfnlo > pfnhi)
3455 3455 return (NULL);
3456 3456
3457 3457 n = pfn_2_mtype(pfnlo);
3458 3458 m = pfn_2_mtype(pfnhi);
3459 3459
3460 3460 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3461 3461 (pfnhi >= mnoderanges[m].mnr_pfnhi));
3462 3462 }
3463 3463 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3464 3464
3465 3465 szc = 0;
3466 3466
3467 3467 /* cycling through mtypes is handled by RANGE0 if n == mtype16m */
3468 3468 if (n == mtype16m) {
3469 3469 flags |= PGI_MT_RANGE0;
3470 3470 n = m;
3471 3471 }
3472 3472
3473 3473 /*
3474 3474 * Try local memory node first, but try remote if we can't
3475 3475 * get a page of the right color.
3476 3476 */
3477 3477 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3478 3478 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3479 3479 /*
3480 3480 * allocate pages from high pfn to low.
3481 3481 */
3482 3482 mtype = m;
3483 3483 do {
3484 3484 if (fullrange != 0) {
3485 3485 pp = page_get_mnode_freelist(mnode,
3486 3486 bin, mtype, szc, flags);
3487 3487 if (pp == NULL) {
3488 3488 pp = page_get_mnode_cachelist(
3489 3489 bin, flags, mnode, mtype);
3490 3490 }
3491 3491 } else {
3492 3492 pp = page_get_mnode_anylist(bin, szc,
3493 3493 flags, mnode, mtype, dma_attr);
3494 3494 }
3495 3495 if (pp != NULL) {
3496 3496 VM_STAT_ADD(pga_vmstats.pga_allocok);
3497 3497 check_dma(dma_attr, pp, 1);
3498 3498 return (pp);
3499 3499 }
3500 3500 } while (mtype != n &&
3501 3501 (mtype = mnoderanges[mtype].mnr_next) != -1);
3502 3502 if (!local_failed_stat) {
3503 3503 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3504 3504 local_failed_stat = 1;
3505 3505 }
3506 3506 }
3507 3507 VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3508 3508
3509 3509 return (NULL);
3510 3510 }
3511 3511
3512 3512 /*
3513 3513 * page_create_io()
3514 3514 *
3515 3515 * This function is a copy of page_create_va() with an additional
3516 3516 * argument 'mattr' that specifies DMA memory requirements to
3517 3517 * the page list functions. This function is used by the segkmem
3518 3518 * allocator so it is only used to create new pages (i.e., PG_EXCL is
3519 3519 * set).
3520 3520 *
3521 3521 * Note: This interface is currently used by x86 PSM only and is
3522 3522 * not fully specified, so the commitment level is only that of a
3523 3523 * private interface specific to x86. This interface uses the PSM-
3524 3524 * specific page_get_anylist() interface.
3525 3525 */
3526 3526
3527 3527 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3528 3528 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3529 3529 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3530 3530 break; \
3531 3531 } \
3532 3532 }
3533 3533
3534 3534
3535 3535 page_t *
3536 3536 page_create_io(
3537 3537 struct vnode *vp,
3538 3538 u_offset_t off,
3539 3539 uint_t bytes,
3540 3540 uint_t flags,
3541 3541 struct as *as,
3542 3542 caddr_t vaddr,
3543 3543 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */
3544 3544 {
3545 3545 page_t *plist = NULL;
3546 3546 uint_t plist_len = 0;
3547 3547 pgcnt_t npages;
3548 3548 page_t *npp = NULL;
3549 3549 uint_t pages_req;
3550 3550 page_t *pp;
3551 3551 kmutex_t *phm = NULL;
3552 3552 uint_t index;
3553 3553
3554 3554 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3555 3555 "page_create_start:vp %p off %llx bytes %u flags %x",
3556 3556 vp, off, bytes, flags);
3557 3557
3558 3558 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3559 3559
3560 3560 pages_req = npages = mmu_btopr(bytes);
3561 3561
3562 3562 /*
3563 3563 * Do the freemem and pcf accounting.
3564 3564 */
3565 3565 if (!page_create_wait(npages, flags)) {
3566 3566 return (NULL);
3567 3567 }
3568 3568
3569 3569 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3570 3570 "page_create_success:vp %p off %llx", vp, off);
3571 3571
3572 3572 /*
3573 3573 * If satisfying this request has left us with too little
3574 3574 * memory, start the wheels turning to get some back. The
3575 3575 * first clause of the test prevents waking up the pageout
3576 3576 * daemon in situations where it would decide that there's
3577 3577 * nothing to do.
3578 3578 */
3579 3579 if (nscan < desscan && freemem < minfree) {
3580 3580 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3581 3581 "pageout_cv_signal:freemem %ld", freemem);
3582 3582 cv_signal(&proc_pageout->p_cv);
3583 3583 }
3584 3584
3585 3585 if (flags & PG_PHYSCONTIG) {
3586 3586
3587 3587 plist = page_get_contigpage(&npages, mattr, 1);
3588 3588 if (plist == NULL) {
3589 3589 page_create_putback(npages);
3590 3590 return (NULL);
3591 3591 }
3592 3592
3593 3593 pp = plist;
3594 3594
3595 3595 do {
3596 3596 if (!page_hashin(pp, vp, off, NULL)) {
3597 3597 				panic("page_create_io: hashin failed %p %p %llx",
3598 3598 (void *)pp, (void *)vp, off);
3599 3599 }
3600 3600 VM_STAT_ADD(page_create_new);
3601 3601 off += MMU_PAGESIZE;
3602 3602 PP_CLRFREE(pp);
3603 3603 PP_CLRAGED(pp);
3604 3604 page_set_props(pp, P_REF);
3605 3605 pp = pp->p_next;
3606 3606 } while (pp != plist);
3607 3607
3608 3608 if (!npages) {
3609 3609 check_dma(mattr, plist, pages_req);
3610 3610 return (plist);
3611 3611 } else {
3612 3612 vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3613 3613 }
3614 3614
3615 3615 /*
3616 3616 * fall-thru:
3617 3617 *
3618 3618 * page_get_contigpage returns when npages <= sgllen.
3619 3619 * Grab the rest of the non-contig pages below from anylist.
3620 3620 */
3621 3621 }
3622 3622
3623 3623 /*
3624 3624 * Loop around collecting the requested number of pages.
3625 3625 * Most of the time, we have to `create' a new page. With
3626 3626 * this in mind, pull the page off the free list before
3627 3627 * getting the hash lock. This will minimize the hash
3628 3628 * lock hold time, nesting, and the like. If it turns
3629 3629 * out we don't need the page, we put it back at the end.
3630 3630 */
3631 3631 while (npages--) {
3632 3632 phm = NULL;
3633 3633
3634 3634 index = PAGE_HASH_FUNC(vp, off);
3635 3635 top:
3636 3636 ASSERT(phm == NULL);
3637 3637 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3638 3638 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3639 3639
3640 3640 if (npp == NULL) {
3641 3641 /*
3642 3642 			 * Try to get a page of any color, either from
3643 3643 * the freelist or from the cache list.
3644 3644 */
3645 3645 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3646 3646 flags & ~PG_MATCH_COLOR, mattr, NULL);
3647 3647 if (npp == NULL) {
3648 3648 if (mattr == NULL) {
3649 3649 /*
3650 3650 * Not looking for a special page;
3651 3651 * panic!
3652 3652 */
3653 3653 panic("no page found %d", (int)npages);
3654 3654 }
3655 3655 /*
3656 3656 * No page found! This can happen
3657 3657 * if we are looking for a page
3658 3658 * within a specific memory range
3659 3659 * for DMA purposes. If PG_WAIT is
3660 3660 * specified then we wait for a
3661 3661 * while and then try again. The
3662 3662 * wait could be forever if we
3663 3663 * don't get the page(s) we need.
3664 3664 *
3665 3665 * Note: XXX We really need a mechanism
3666 3666 * to wait for pages in the desired
3667 3667 * range. For now, we wait for any
3668 3668 				 * pages and see if we can use them.
3669 3669 */
3670 3670
3671 3671 if ((mattr != NULL) && (flags & PG_WAIT)) {
3672 3672 delay(10);
3673 3673 goto top;
3674 3674 }
3675 3675 goto fail; /* undo accounting stuff */
3676 3676 }
3677 3677
3678 3678 if (PP_ISAGED(npp) == 0) {
3679 3679 /*
3680 3680 * Since this page came from the
3681 3681 * cachelist, we must destroy the
3682 3682 * old vnode association.
3683 3683 */
3684 3684 page_hashout(npp, (kmutex_t *)NULL);
3685 3685 }
3686 3686 }
3687 3687
3688 3688 /*
3689 3689 * We own this page!
3690 3690 */
3691 3691 ASSERT(PAGE_EXCL(npp));
3692 3692 ASSERT(npp->p_vnode == NULL);
3693 3693 ASSERT(!hat_page_is_mapped(npp));
3694 3694 PP_CLRFREE(npp);
3695 3695 PP_CLRAGED(npp);
3696 3696
3697 3697 /*
3698 3698 		 * Here we have a page in our hot little mitts and are
3699 3699 * just waiting to stuff it on the appropriate lists.
3700 3700 * Get the mutex and check to see if it really does
3701 3701 * not exist.
3702 3702 */
3703 3703 phm = PAGE_HASH_MUTEX(index);
3704 3704 mutex_enter(phm);
3705 3705 PAGE_HASH_SEARCH(index, pp, vp, off);
3706 3706 if (pp == NULL) {
3707 3707 VM_STAT_ADD(page_create_new);
3708 3708 pp = npp;
3709 3709 npp = NULL;
3710 3710 if (!page_hashin(pp, vp, off, phm)) {
3711 3711 /*
3712 3712 * Since we hold the page hash mutex and
3713 3713 * just searched for this page, page_hashin
3714 3714 * had better not fail. If it does, that
3715 3715 				 * means some thread did not follow the
3716 3716 * page hash mutex rules. Panic now and
3717 3717 * get it over with. As usual, go down
3718 3718 * holding all the locks.
3719 3719 */
3720 3720 ASSERT(MUTEX_HELD(phm));
3721 3721 panic("page_create: hashin fail %p %p %llx %p",
3722 3722 (void *)pp, (void *)vp, off, (void *)phm);
3723 3723
3724 3724 }
3725 3725 ASSERT(MUTEX_HELD(phm));
3726 3726 mutex_exit(phm);
3727 3727 phm = NULL;
3728 3728
3729 3729 /*
3730 3730 * Hat layer locking need not be done to set
3731 3731 * the following bits since the page is not hashed
3732 3732 * and was on the free list (i.e., had no mappings).
3733 3733 *
3734 3734 * Set the reference bit to protect
3735 3735 * against immediate pageout
3736 3736 *
3737 3737 * XXXmh modify freelist code to set reference
3738 3738 * bit so we don't have to do it here.
3739 3739 */
3740 3740 page_set_props(pp, P_REF);
3741 3741 } else {
3742 3742 ASSERT(MUTEX_HELD(phm));
3743 3743 mutex_exit(phm);
3744 3744 phm = NULL;
3745 3745 /*
3746 3746 * NOTE: This should not happen for pages associated
3747 3747 * with kernel vnode 'kvp'.
3748 3748 */
3749 3749 /* XX64 - to debug why this happens! */
3750 3750 ASSERT(!VN_ISKAS(vp));
3751 3751 if (VN_ISKAS(vp))
3752 3752 cmn_err(CE_NOTE,
3753 3753 "page_create: page not expected "
3754 3754 "in hash list for kernel vnode - pp 0x%p",
3755 3755 (void *)pp);
3756 3756 VM_STAT_ADD(page_create_exists);
3757 3757 goto fail;
3758 3758 }
3759 3759
3760 3760 /*
3761 3761 * Got a page! It is locked. Acquire the i/o
3762 3762 * lock since we are going to use the p_next and
3763 3763 * p_prev fields to link the requested pages together.
3764 3764 */
3765 3765 page_io_lock(pp);
3766 3766 page_add(&plist, pp);
3767 3767 plist = plist->p_next;
3768 3768 off += MMU_PAGESIZE;
3769 3769 vaddr += MMU_PAGESIZE;
3770 3770 }
3771 3771
3772 3772 check_dma(mattr, plist, pages_req);
3773 3773 return (plist);
3774 3774
3775 3775 fail:
3776 3776 if (npp != NULL) {
3777 3777 /*
3778 3778 * Did not need this page after all.
3779 3779 * Put it back on the free list.
3780 3780 */
3781 3781 VM_STAT_ADD(page_create_putbacks);
3782 3782 PP_SETFREE(npp);
3783 3783 PP_SETAGED(npp);
3784 3784 npp->p_offset = (u_offset_t)-1;
3785 3785 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3786 3786 page_unlock(npp);
3787 3787 }
3788 3788
3789 3789 /*
3790 3790 * Give up the pages we already got.
3791 3791 */
3792 3792 while (plist != NULL) {
3793 3793 pp = plist;
3794 3794 page_sub(&plist, pp);
3795 3795 page_io_unlock(pp);
3796 3796 plist_len++;
3797 3797 /*LINTED: constant in conditional ctx*/
3798 3798 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3799 3799 }
3800 3800
3801 3801 /*
3802 3802 * VN_DISPOSE does freemem accounting for the pages in plist
3803 3803 * by calling page_free. So, we need to undo the pcf accounting
3804 3804 * for only the remaining pages.
3805 3805 */
3806 3806 VM_STAT_ADD(page_create_putbacks);
3807 3807 page_create_putback(pages_req - plist_len);
3808 3808
3809 3809 return (NULL);
3810 3810 }
3811 3811 #endif /* !__xpv */
3812 3812
3813 3813
3814 3814 /*
3815 3815 * Copy the data from the physical page represented by "frompp" to
3816 3816 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3817 3817 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
3818 3818 * level and no one sleeps with an active mapping there.
3819 3819 *
3820 3820 * Note that the ref/mod bits in the page_t's are not affected by
3821 3821 * this operation, hence it is up to the caller to update them appropriately.
3822 3822 */
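/*
 * Sketch of the assumed caller pattern (illustrative, not from this file):
 * ppcopy() returns 0 if the copy faulted, e.g. on an uncorrectable error
 * in the source page, so callers check the result:
 *
 *	if (ppcopy(srcpp, dstpp) == 0)
 *		... recover, e.g. retry with another source or fail the op ...
 */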
3823 3823 int
3824 3824 ppcopy(page_t *frompp, page_t *topp)
3825 3825 {
3826 3826 caddr_t pp_addr1;
3827 3827 caddr_t pp_addr2;
3828 3828 hat_mempte_t pte1;
3829 3829 hat_mempte_t pte2;
3830 3830 kmutex_t *ppaddr_mutex;
3831 3831 label_t ljb;
3832 3832 int ret = 1;
3833 3833
3834 3834 ASSERT_STACK_ALIGNED();
3835 3835 ASSERT(PAGE_LOCKED(frompp));
3836 3836 ASSERT(PAGE_LOCKED(topp));
3837 3837
3838 3838 if (kpm_enable) {
3839 3839 pp_addr1 = hat_kpm_page2va(frompp, 0);
3840 3840 pp_addr2 = hat_kpm_page2va(topp, 0);
3841 3841 kpreempt_disable();
3842 3842 } else {
3843 3843 /*
3844 3844 		 * disable preemption so that the CPU can't change
3845 3845 */
3846 3846 kpreempt_disable();
3847 3847
3848 3848 pp_addr1 = CPU->cpu_caddr1;
3849 3849 pp_addr2 = CPU->cpu_caddr2;
3850 3850 pte1 = CPU->cpu_caddr1pte;
3851 3851 pte2 = CPU->cpu_caddr2pte;
3852 3852
3853 3853 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3854 3854 mutex_enter(ppaddr_mutex);
3855 3855
3856 3856 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3857 3857 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3858 3858 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3859 3859 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3860 3860 HAT_LOAD_NOCONSIST);
3861 3861 }
3862 3862
3863 3863 if (on_fault(&ljb)) {
3864 3864 ret = 0;
3865 3865 goto faulted;
3866 3866 }
3867 3867 if (use_sse_pagecopy)
3868 3868 #ifdef __xpv
3869 3869 page_copy_no_xmm(pp_addr2, pp_addr1);
3870 3870 #else
3871 3871 hwblkpagecopy(pp_addr1, pp_addr2);
3872 3872 #endif
3873 3873 else
3874 3874 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3875 3875
3876 3876 no_fault();
3877 3877 faulted:
3878 3878 if (!kpm_enable) {
3879 3879 #ifdef __xpv
3880 3880 /*
3881 3881 		 * We can't leave unused mappings lying about under the
3882 3882 * hypervisor, so blow them away.
3883 3883 */
3884 3884 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3885 3885 UVMF_INVLPG | UVMF_LOCAL) < 0)
3886 3886 panic("HYPERVISOR_update_va_mapping() failed");
3887 3887 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3888 3888 UVMF_INVLPG | UVMF_LOCAL) < 0)
3889 3889 panic("HYPERVISOR_update_va_mapping() failed");
3890 3890 #endif
3891 3891 mutex_exit(ppaddr_mutex);
3892 3892 }
3893 3893 kpreempt_enable();
3894 3894 return (ret);
3895 3895 }
3896 3896
3897 3897 void
3898 3898 pagezero(page_t *pp, uint_t off, uint_t len)
3899 3899 {
3900 3900 ASSERT(PAGE_LOCKED(pp));
3901 3901 pfnzero(page_pptonum(pp), off, len);
3902 3902 }
3903 3903
3904 3904 /*
3905 3905 * Zero the physical page from off to off + len given by pfn
3906 3906 * without changing the reference and modified bits of page.
3907 3907 *
3908 3908  * We do this using CPU private page address #2; see ppcopy() for more info.
3909 3909 * pfnzero() must not be called at interrupt level.
3910 3910 */
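/*
 * Illustrative call (hypothetical): zeroing a whole locked page is
 * pagezero(pp, 0, MMU_PAGESIZE), which reduces to
 * pfnzero(page_pptonum(pp), 0, MMU_PAGESIZE) via the wrapper above.
 */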
3911 3911 void
3912 3912 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3913 3913 {
3914 3914 caddr_t pp_addr2;
3915 3915 hat_mempte_t pte2;
3916 3916 kmutex_t *ppaddr_mutex = NULL;
3917 3917
3918 3918 ASSERT_STACK_ALIGNED();
3919 3919 ASSERT(len <= MMU_PAGESIZE);
3920 3920 ASSERT(off <= MMU_PAGESIZE);
3921 3921 ASSERT(off + len <= MMU_PAGESIZE);
3922 3922
3923 3923 if (kpm_enable && !pfn_is_foreign(pfn)) {
3924 3924 pp_addr2 = hat_kpm_pfn2va(pfn);
3925 3925 kpreempt_disable();
3926 3926 } else {
3927 3927 kpreempt_disable();
3928 3928
3929 3929 pp_addr2 = CPU->cpu_caddr2;
3930 3930 pte2 = CPU->cpu_caddr2pte;
3931 3931
3932 3932 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3933 3933 mutex_enter(ppaddr_mutex);
3934 3934
3935 3935 hat_mempte_remap(pfn, pp_addr2, pte2,
3936 3936 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3937 3937 HAT_LOAD_NOCONSIST);
3938 3938 }
3939 3939
3940 3940 if (use_sse_pagezero) {
3941 3941 #ifdef __xpv
3942 3942 uint_t rem;
3943 3943
3944 3944 /*
3945 3945 * zero a byte at a time until properly aligned for
3946 3946 * block_zero_no_xmm().
3947 3947 */
3948 3948 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3949 3949 pp_addr2[off++] = 0;
3950 3950
3951 3951 /*
3952 3952 * Now use faster block_zero_no_xmm() for any range
3953 3953 * that is properly aligned and sized.
3954 3954 */
3955 3955 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3956 3956 len -= rem;
3957 3957 if (len != 0) {
3958 3958 block_zero_no_xmm(pp_addr2 + off, len);
3959 3959 off += len;
3960 3960 }
3961 3961
3962 3962 /*
3963 3963 * zero remainder with byte stores.
3964 3964 */
3965 3965 while (rem-- > 0)
3966 3966 pp_addr2[off++] = 0;
3967 3967 #else
3968 3968 hwblkclr(pp_addr2 + off, len);
3969 3969 #endif
3970 3970 } else {
3971 3971 bzero(pp_addr2 + off, len);
3972 3972 }
3973 3973
3974 3974 if (!kpm_enable || pfn_is_foreign(pfn)) {
3975 3975 #ifdef __xpv
3976 3976 /*
3977 3977 * On the hypervisor this page might get used for a page
3978 3978 * table before any intervening change to this mapping,
3979 3979 * so blow it away.
3980 3980 */
3981 3981 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3982 3982 UVMF_INVLPG) < 0)
3983 3983 panic("HYPERVISOR_update_va_mapping() failed");
3984 3984 #endif
3985 3985 mutex_exit(ppaddr_mutex);
3986 3986 }
3987 3987
3988 3988 kpreempt_enable();
3989 3989 }
3990 3990
3991 3991 /*
3992 3992 * Platform-dependent page scrub call.
3993 3993 */
3994 3994 void
3995 3995 pagescrub(page_t *pp, uint_t off, uint_t len)
3996 3996 {
3997 3997 /*
3998 3998 * For now, we rely on the fact that pagezero() will
3999 3999 * always clear UEs.
4000 4000 */
4001 4001 pagezero(pp, off, len);
4002 4002 }
4003 4003
4004 4004 /*
4005 4005  * Set up two private addresses on a given CPU for use in ppcopy().
4006 4006 */
4007 4007 void
4008 4008 setup_vaddr_for_ppcopy(struct cpu *cpup)
4009 4009 {
4010 4010 void *addr;
4011 4011 hat_mempte_t pte_pa;
4012 4012
4013 4013 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4014 4014 pte_pa = hat_mempte_setup(addr);
4015 4015 cpup->cpu_caddr1 = addr;
4016 4016 cpup->cpu_caddr1pte = pte_pa;
4017 4017
4018 4018 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4019 4019 pte_pa = hat_mempte_setup(addr);
4020 4020 cpup->cpu_caddr2 = addr;
4021 4021 cpup->cpu_caddr2pte = pte_pa;
4022 4022
4023 4023 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4024 4024 }
4025 4025
4026 4026 /*
4027 4027 * Undo setup_vaddr_for_ppcopy
4028 4028 */
4029 4029 void
4030 4030 teardown_vaddr_for_ppcopy(struct cpu *cpup)
4031 4031 {
4032 4032 mutex_destroy(&cpup->cpu_ppaddr_mutex);
4033 4033
4034 4034 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4035 4035 cpup->cpu_caddr2pte = 0;
4036 4036 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4037 4037 cpup->cpu_caddr2 = 0;
4038 4038
4039 4039 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4040 4040 cpup->cpu_caddr1pte = 0;
4041 4041 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4042 4042 cpup->cpu_caddr1 = 0;
4043 4043 }
4044 4044
4045 4045 /*
4046 4046 * Function for flushing D-cache when performing module relocations
4047 4047 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
4048 4048 */
4049 4049 void
4050 4050 dcache_flushall()
4051 4051 {}
4052 4052
4053 4053 /*
4054 4054 * Allocate a memory page. The argument 'seed' can be any pseudo-random
4055 4055 * number to vary where the pages come from. This is quite a hacked up
4056 4056 * method -- it works for now, but really needs to be fixed up a bit.
4057 4057 *
4058 4058 * We currently use page_create_va() on the kvp with fake offsets,
4059 4059 * segments and virt address. This is pretty bogus, but was copied from the
4060 4060 * old hat_i86.c code. A better approach would be to specify either mnode
4061 4061  * random or mnode local and take a page from whatever color has the MOST
4062 4062 * available - this would have a minimal impact on page coloring.
4063 4063 */
4064 4064 page_t *
4065 4065 page_get_physical(uintptr_t seed)
4066 4066 {
4067 4067 page_t *pp;
4068 4068 u_offset_t offset;
4069 4069 static struct seg tmpseg;
4070 4070 static uintptr_t ctr = 0;
4071 4071
4072 4072 /*
4073 4073 	 * This code is gross; we really need a simpler page allocator.
4074 4074 	 *
4075 4075 	 * We need to assign an offset for the page to call page_create_va().
4076 4076 	 * To avoid conflicts with other pages, we get creative with the offset.
4077 4077 	 * For 32 bits, we need an offset > 4Gig.
4078 4078 	 * For 64 bits, we need an offset somewhere in the VA hole.
4079 4079 */
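	/*
	 * Worked example with illustrative numbers: on amd64 with
	 * MMU_PAGESHIFT == 12, a seed of 0x1000 that is below kernelbase
	 * becomes offset = (0x1000 << 12) + mmu.hole_start, a fake offset
	 * safely inside the VA hole.
	 */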
4080 4080 offset = seed;
4081 4081 if (offset > kernelbase)
4082 4082 offset -= kernelbase;
4083 4083 offset <<= MMU_PAGESHIFT;
4084 4084 #if defined(__amd64)
4085 4085 offset += mmu.hole_start; /* something in VA hole */
4086 4086 #else
4087 4087 offset += 1ULL << 40; /* something > 4 Gig */
4088 4088 #endif
4089 4089
4090 4090 if (page_resv(1, KM_NOSLEEP) == 0)
4091 4091 return (NULL);
4092 4092
4093 4093 #ifdef DEBUG
4094 4094 pp = page_exists(&kvp, offset);
4095 4095 if (pp != NULL)
4096 4096 panic("page already exists %p", (void *)pp);
4097 4097 #endif
4098 4098
4099 4099 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4100 4100 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */
4101 4101 if (pp != NULL) {
4102 4102 page_io_unlock(pp);
4103 4103 page_downgrade(pp);
4104 4104 }
4105 4105 return (pp);
4106 4106 }