illumos-gate Wdiff usr/src/uts/i86pc/vm/vm_machdep.c

Print this page

10806 mnode_range_setup() makes assumptions about mnodes
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/vm/vm_machdep.c
          +++ new/usr/src/uts/i86pc/vm/vm_machdep.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying

↓ open down ↓

16 lines elided

↑ open up ↑

  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  /*
  25   25   * Copyright (c) 2010, Intel Corporation.
  26   26   * All rights reserved.
  27      - * Copyright 2018 Joyent, Inc.
       27 + * Copyright 2019, Joyent, Inc.
  28   28   */
  29   29  
  30   30  /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31   31  /*      All Rights Reserved   */
  32   32  
  33   33  /*
  34   34   * Portions of this source code were derived from Berkeley 4.3 BSD
  35   35   * under license from the Regents of the University of California.
  36   36   */
  37   37

  38   38  /*
  39   39   * UNIX machine dependent virtual memory support.
  40   40   */
  41   41  
  42   42  #include <sys/types.h>
  43   43  #include <sys/param.h>
  44   44  #include <sys/systm.h>
  45   45  #include <sys/user.h>
  46   46  #include <sys/proc.h>
  47   47  #include <sys/kmem.h>
  48   48  #include <sys/vmem.h>
  49   49  #include <sys/buf.h>
  50   50  #include <sys/cpuvar.h>
  51   51  #include <sys/lgrp.h>
  52   52  #include <sys/disp.h>
  53   53  #include <sys/vm.h>
  54   54  #include <sys/mman.h>
  55   55  #include <sys/vnode.h>
  56   56  #include <sys/cred.h>
  57   57  #include <sys/exec.h>
  58   58  #include <sys/exechdr.h>
  59   59  #include <sys/debug.h>
  60   60  #include <sys/vmsystm.h>
  61   61  #include <sys/swap.h>
  62   62  #include <sys/dumphdr.h>
  63   63  #include <sys/random.h>
  64   64  
  65   65  #include <vm/hat.h>
  66   66  #include <vm/as.h>
  67   67  #include <vm/seg.h>
  68   68  #include <vm/seg_kp.h>
  69   69  #include <vm/seg_vn.h>
  70   70  #include <vm/page.h>
  71   71  #include <vm/seg_kmem.h>
  72   72  #include <vm/seg_kpm.h>
  73   73  #include <vm/vm_dep.h>
  74   74  
  75   75  #include <sys/cpu.h>
  76   76  #include <sys/vm_machparam.h>
  77   77  #include <sys/memlist.h>
  78   78  #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  79   79  #include <vm/hat_i86.h>
  80   80  #include <sys/x86_archext.h>
  81   81  #include <sys/elf_386.h>
  82   82  #include <sys/cmn_err.h>
  83   83  #include <sys/archsystm.h>
  84   84  #include <sys/machsystm.h>
  85   85  #include <sys/secflags.h>
  86   86  
  87   87  #include <sys/vtrace.h>
  88   88  #include <sys/ddidmareq.h>
  89   89  #include <sys/promif.h>
  90   90  #include <sys/memnode.h>
  91   91  #include <sys/stack.h>
  92   92  #include <util/qsort.h>
  93   93  #include <sys/taskq.h>
  94   94  
  95   95  #ifdef __xpv
  96   96  
  97   97  #include <sys/hypervisor.h>
  98   98  #include <sys/xen_mmu.h>
  99   99  #include <sys/balloon_impl.h>
 100  100  
 101  101  /*
 102  102   * domain 0 pages usable for DMA are kept pre-allocated and kept in
 103  103   * distinct lists, ordered by increasing mfn.
 104  104   */
 105  105  static kmutex_t io_pool_lock;
 106  106  static kmutex_t contig_list_lock;
 107  107  static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 108  108  static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 109  109  static long io_pool_cnt;
 110  110  static long io_pool_cnt_max = 0;
 111  111  #define DEFAULT_IO_POOL_MIN     128
 112  112  static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 113  113  static long io_pool_cnt_lowater = 0;
 114  114  static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 115  115  static long io_pool_shrinks;    /* how many times did we really shrink */
 116  116  static long io_pool_grows;      /* how many times did we grow */
 117  117  static mfn_t start_mfn = 1;
 118  118  static caddr_t io_pool_kva;     /* use to alloc pages when needed */
 119  119  
 120  120  static int create_contig_pfnlist(uint_t);
 121  121  
 122  122  /*
 123  123   * percentage of phys mem to hold in the i/o pool
 124  124   */
 125  125  #define DEFAULT_IO_POOL_PCT     2
 126  126  static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 127  127  static void page_io_pool_sub(page_t **, page_t *, page_t *);
 128  128  int ioalloc_dbg = 0;
 129  129  
 130  130  #endif /* __xpv */
 131  131  
 132  132  uint_t vac_colors = 1;
 133  133  
 134  134  int largepagesupport = 0;
 135  135  extern uint_t page_create_new;
 136  136  extern uint_t page_create_exists;
 137  137  extern uint_t page_create_putbacks;
 138  138  /*
 139  139   * Allow users to disable the kernel's use of SSE.
 140  140   */
 141  141  extern int use_sse_pagecopy, use_sse_pagezero;
 142  142  
 143  143  /*
 144  144   * combined memory ranges from mnode and memranges[] to manage single
 145  145   * mnode/mtype dimension in the page lists.
 146  146   */
 147  147  typedef struct {
 148  148          pfn_t   mnr_pfnlo;
 149  149          pfn_t   mnr_pfnhi;
 150  150          int     mnr_mnode;
 151  151          int     mnr_memrange;           /* index into memranges[] */
 152  152          int     mnr_next;               /* next lower PA mnoderange */

↓ open down ↓

115 lines elided

↑ open up ↑

 153  153          int     mnr_exists;
 154  154          /* maintain page list stats */
 155  155          pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 156  156          pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 157  157          pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 158  158  #ifdef DEBUG
 159  159          struct mnr_mts {                /* mnode/mtype szc stats */
 160  160                  pgcnt_t mnr_mts_pgcnt;
 161  161                  int     mnr_mts_colors;
 162  162                  pgcnt_t *mnr_mtsc_pgcnt;
 163      -        }       *mnr_mts;
      163 +        }       *mnr_mts;
 164  164  #endif
 165  165  } mnoderange_t;
 166  166  
 167  167  #define MEMRANGEHI(mtype)                                               \
 168  168          ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169  169  #define MEMRANGELO(mtype)       (memranges[mtype])
 170  170  
 171  171  #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172  172  
 173  173  /*

 174  174   * As the PC architecture evolved, memory was clumped into several
 175  175   * ranges for various historical I/O devices to do DMA.
 176  176   * < 16Meg - ISA bus
 177  177   * < 2Gig - ???
 178  178   * < 4Gig - PCI bus or drivers that don't understand PAE mode
 179  179   *
 180  180   * These are listed in reverse order, so that we can skip over unused
 181  181   * ranges on machines with small memories.
 182  182   *
 183  183   * For now under the Hypervisor, we'll only ever have one memrange.
 184  184   */
 185  185  #define PFN_4GIG        0x100000
 186  186  #define PFN_16MEG       0x1000
 187  187  /* Indices into the memory range (arch_memranges) array. */
 188  188  #define MRI_4G          0
 189  189  #define MRI_2G          1
 190  190  #define MRI_16M         2
 191  191  #define MRI_0           3
 192  192  static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193  193      PFN_4GIG,   /* pfn range for 4G and above */
 194  194      0x80000,    /* pfn range for 2G-4G */

↓ open down ↓

21 lines elided

↑ open up ↑

 195  195      PFN_16MEG,  /* pfn range for 16M-2G */
 196  196      0x00000,    /* pfn range for 0-16M */
 197  197  };
 198  198  pfn_t *memranges = &arch_memranges[0];
 199  199  int nranges = NUM_MEM_RANGES;
 200  200  
 201  201  /*
 202  202   * This combines mem_node_config and memranges into one data
 203  203   * structure to be used for page list management.
 204  204   */
 205      -mnoderange_t    *mnoderanges;
 206      -int             mnoderangecnt;
 207      -int             mtype4g;
 208      -int             mtype16m;
 209      -int             mtypetop;       /* index of highest pfn'ed mnoderange */
      205 +static mnoderange_t *mnoderanges;
      206 +static int mnoderangecnt;
      207 +static int mtype4g;
      208 +static int mtype16m;
      209 +static int mtypetop;
 210  210  
 211  211  /*
 212  212   * 4g memory management variables for systems with more than 4g of memory:
 213  213   *
 214  214   * physical memory below 4g is required for 32bit dma devices and, currently,
 215  215   * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  216   * below 4g can be depleted without any paging activity given that there is
 217  217   * likely to be sufficient memory above 4g.
 218  218   *
 219  219   * physmax4g is set true if the largest pfn is over 4g. The rest of the

 220  220   * 4g memory management code is enabled only when physmax4g is true.
 221  221   *
 222  222   * maxmem4g is the count of the maximum number of pages on the page lists
 223  223   * with physical addresses below 4g. It can be a lot less than 4g given that
 224  224   * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 225  225   * agp aperture etc.
 226  226   *
 227  227   * freemem4g maintains the count of the number of available pages on the
 228  228   * page lists with physical addresses below 4g.
 229  229   *
 230  230   * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 231  231   * 6% (desfree4gshift = 4) of maxmem4g.
 232  232   *
 233  233   * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 234  234   * and the amount of physical memory above 4g is greater than freemem4g.
 235  235   * In this case, page_get_* routines will restrict below 4g allocations
 236  236   * for requests that don't specifically require it.
 237  237   */
 238  238  
 239  239  #define DESFREE4G       (maxmem4g >> desfree4gshift)
 240  240  
 241  241  #define RESTRICT4G_ALLOC                                        \
 242  242          (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 243  243  
 244  244  static pgcnt_t  maxmem4g;
 245  245  static pgcnt_t  freemem4g;
 246  246  static int      physmax4g;
 247  247  static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 248  248  
 249  249  /*
 250  250   * 16m memory management:
 251  251   *
 252  252   * reserve some amount of physical memory below 16m for legacy devices.
 253  253   *
 254  254   * RESTRICT16M_ALLOC returns true if there are sufficient free pages above

↓ open down ↓

35 lines elided

↑ open up ↑

 255  255   * 16m or if the 16m pool drops below DESFREE16M.
 256  256   *
 257  257   * In this case, general page allocations via page_get_{free,cache}list
 258  258   * routines will be restricted from allocating from the 16m pool. Allocations
 259  259   * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  260   * are not restricted.
 261  261   */
 262  262  
 263  263  #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264  264  #define DESFREE16M      desfree16m
 265      -#define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 266      -        ((freemem != 0) && ((flags & PG_PANIC) == 0) &&         \
 267      -            ((freemem >= (FREEMEM16M)) ||                       \
      265 +#define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
      266 +        (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
      267 +            ((freemem >= (FREEMEM16M)) || \
 268  268              (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269  269  
 270  270  static pgcnt_t  desfree16m = 0x380;
 271  271  
 272  272  /*
 273  273   * This can be patched via /etc/system to allow old non-PAE aware device
 274  274   * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  275   */
 276  276  int restricted_kmemalloc = 0;
 277  277

 278  278  #ifdef VM_STATS
 279  279  struct {
 280  280          ulong_t pga_alloc;
 281  281          ulong_t pga_notfullrange;
 282  282          ulong_t pga_nulldmaattr;
 283  283          ulong_t pga_allocok;
 284  284          ulong_t pga_allocfailed;
 285  285          ulong_t pgma_alloc;
 286  286          ulong_t pgma_allocok;
 287  287          ulong_t pgma_allocfailed;
 288  288          ulong_t pgma_allocempty;
 289  289  } pga_vmstats;
 290  290  #endif
 291  291  
 292  292  uint_t mmu_page_sizes;
 293  293  
 294  294  /* How many page sizes the users can see */
 295  295  uint_t mmu_exported_page_sizes;
 296  296  
 297  297  /* page sizes that legacy applications can see */
 298  298  uint_t mmu_legacy_page_sizes;
 299  299  
 300  300  /*
 301  301   * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 302  302   * fewer than this many pages.
 303  303   */
 304  304  pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 305  305  pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 306  306  
 307  307  /*
 308  308   * Maximum and default segment size tunables for user private
 309  309   * and shared anon memory, and user text and initialized data.
 310  310   * These can be patched via /etc/system to allow large pages
 311  311   * to be used for mapping application private and shared anon memory.
 312  312   */
 313  313  size_t mcntl0_lpsize = MMU_PAGESIZE;
 314  314  size_t max_uheap_lpsize = MMU_PAGESIZE;
 315  315  size_t default_uheap_lpsize = MMU_PAGESIZE;
 316  316  size_t max_ustack_lpsize = MMU_PAGESIZE;
 317  317  size_t default_ustack_lpsize = MMU_PAGESIZE;
 318  318  size_t max_privmap_lpsize = MMU_PAGESIZE;
 319  319  size_t max_uidata_lpsize = MMU_PAGESIZE;
 320  320  size_t max_utext_lpsize = MMU_PAGESIZE;
 321  321  size_t max_shm_lpsize = MMU_PAGESIZE;
 322  322  
 323  323  
 324  324  /*
 325  325   * initialized by page_coloring_init().
 326  326   */
 327  327  uint_t  page_colors;
 328  328  uint_t  page_colors_mask;
 329  329  uint_t  page_coloring_shift;
 330  330  int     cpu_page_colors;
 331  331  static uint_t   l2_colors;
 332  332  
 333  333  /*
 334  334   * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 335  335   * and page_colors are calculated from the l2 cache n-way set size.  Within a
 336  336   * mnode range, the page freelist and cachelist are hashed into bins based on
 337  337   * color. This makes it easier to search for a page within a specific memory
 338  338   * range.
 339  339   */
 340  340  #define PAGE_COLORS_MIN 16
 341  341  
 342  342  page_t ****page_freelists;
 343  343  page_t ***page_cachelists;
 344  344  
 345  345  
 346  346  /*
 347  347   * Used by page layer to know about page sizes
 348  348   */
 349  349  hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 350  350  
 351  351  kmutex_t        *fpc_mutex[NPC_MUTEX];
 352  352  kmutex_t        *cpc_mutex[NPC_MUTEX];
 353  353  
 354  354  /* Lock to protect mnoderanges array for memory DR operations. */
 355  355  static kmutex_t mnoderange_lock;
 356  356  
 357  357  /*
 358  358   * Only let one thread at a time try to coalesce large pages, to
 359  359   * prevent them from working against each other.
 360  360   */
 361  361  static kmutex_t contig_lock;
 362  362  #define CONTIG_LOCK()   mutex_enter(&contig_lock);
 363  363  #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 364  364  
 365  365  #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 366  366  
 367  367  caddr_t
 368  368  i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
 369  369  {
 370  370          caddr_t addr;
 371  371          caddr_t addr1;
 372  372          page_t *pp;
 373  373  
 374  374          addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
 375  375  
 376  376          for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
 377  377                  pp = page_numtopp_nolock(pf);
 378  378                  if (pp == NULL) {
 379  379                          hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
 380  380                              prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 381  381                  } else {
 382  382                          hat_memload(kas.a_hat, addr, pp,
 383  383                              prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 384  384                  }
 385  385          }
 386  386  
 387  387          return (addr1);
 388  388  }
 389  389  
 390  390  /*
 391  391   * This routine is like page_numtopp, but accepts only free pages, which
 392  392   * it allocates (unfrees) and returns with the exclusive lock held.
 393  393   * It is used by machdep.c/dma_init() to find contiguous free pages.
 394  394   */
 395  395  page_t *
 396  396  page_numtopp_alloc(pfn_t pfnum)
 397  397  {
 398  398          page_t *pp;
 399  399  
 400  400  retry:
 401  401          pp = page_numtopp_nolock(pfnum);
 402  402          if (pp == NULL) {
 403  403                  return (NULL);
 404  404          }
 405  405  
 406  406          if (!page_trylock(pp, SE_EXCL)) {
 407  407                  return (NULL);
 408  408          }
 409  409  
 410  410          if (page_pptonum(pp) != pfnum) {
 411  411                  page_unlock(pp);
 412  412                  goto retry;
 413  413          }
 414  414  
 415  415          if (!PP_ISFREE(pp)) {
 416  416                  page_unlock(pp);
 417  417                  return (NULL);
 418  418          }
 419  419          if (pp->p_szc) {
 420  420                  page_demote_free_pages(pp);
 421  421                  page_unlock(pp);
 422  422                  goto retry;
 423  423          }
 424  424  
 425  425          /* If associated with a vnode, destroy mappings */
 426  426  
 427  427          if (pp->p_vnode) {
 428  428  
 429  429                  page_destroy_free(pp);
 430  430  
 431  431                  if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 432  432                          return (NULL);
 433  433                  }
 434  434  
 435  435                  if (page_pptonum(pp) != pfnum) {
 436  436                          page_unlock(pp);
 437  437                          goto retry;
 438  438                  }
 439  439          }
 440  440  
 441  441          if (!PP_ISFREE(pp)) {
 442  442                  page_unlock(pp);
 443  443                  return (NULL);
 444  444          }
 445  445  
 446  446          if (!page_reclaim(pp, (kmutex_t *)NULL))
 447  447                  return (NULL);
 448  448  
 449  449          return (pp);
 450  450  }
 451  451  
 452  452  /*
 453  453   * Return the optimum page size for a given mapping
 454  454   */
 455  455  /*ARGSUSED*/
 456  456  size_t
 457  457  map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 458  458  {
 459  459          level_t l = 0;
 460  460          size_t pgsz = MMU_PAGESIZE;
 461  461          size_t max_lpsize;
 462  462          uint_t mszc;
 463  463  
 464  464          ASSERT(maptype != MAPPGSZ_VA);
 465  465  
 466  466          if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 467  467                  return (MMU_PAGESIZE);
 468  468          }
 469  469  
 470  470          switch (maptype) {
 471  471          case MAPPGSZ_HEAP:
 472  472          case MAPPGSZ_STK:
 473  473                  max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 474  474                      MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 475  475                  if (max_lpsize == MMU_PAGESIZE) {
 476  476                          return (MMU_PAGESIZE);
 477  477                  }
 478  478                  if (len == 0) {
 479  479                          len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 480  480                              p->p_brksize - p->p_bssbase : p->p_stksize;
 481  481                  }
 482  482                  len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 483  483                      default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 484  484  
 485  485                  /*
 486  486                   * use the pages size that best fits len
 487  487                   */
 488  488                  for (l = mmu.umax_page_level; l > 0; --l) {
 489  489                          if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 490  490                                  continue;
 491  491                          } else {
 492  492                                  pgsz = LEVEL_SIZE(l);
 493  493                          }
 494  494                          break;
 495  495                  }
 496  496  
 497  497                  mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 498  498                      p->p_stkpageszc);
 499  499                  if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 500  500                          pgsz = hw_page_array[mszc].hp_size;
 501  501                  }
 502  502                  return (pgsz);
 503  503  
 504  504          case MAPPGSZ_ISM:
 505  505                  for (l = mmu.umax_page_level; l > 0; --l) {
 506  506                          if (len >= LEVEL_SIZE(l))
 507  507                                  return (LEVEL_SIZE(l));
 508  508                  }
 509  509                  return (LEVEL_SIZE(0));
 510  510          }
 511  511          return (pgsz);
 512  512  }
 513  513  
 514  514  static uint_t
 515  515  map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 516  516      size_t min_physmem)
 517  517  {
 518  518          caddr_t eaddr = addr + size;
 519  519          uint_t szcvec = 0;
 520  520          caddr_t raddr;
 521  521          caddr_t readdr;
 522  522          size_t  pgsz;
 523  523          int i;
 524  524  
 525  525          if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 526  526                  return (0);
 527  527          }
 528  528  
 529  529          for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 530  530                  pgsz = page_get_pagesize(i);
 531  531                  if (pgsz > max_lpsize) {
 532  532                          continue;
 533  533                  }
 534  534                  raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 535  535                  readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 536  536                  if (raddr < addr || raddr >= readdr) {
 537  537                          continue;
 538  538                  }
 539  539                  if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 540  540                          continue;
 541  541                  }
 542  542                  /*
 543  543                   * Set szcvec to the remaining page sizes.
 544  544                   */
 545  545                  szcvec = ((1 << (i + 1)) - 1) & ~1;
 546  546                  break;
 547  547          }
 548  548          return (szcvec);
 549  549  }
 550  550  
 551  551  /*
 552  552   * Return a bit vector of large page size codes that
 553  553   * can be used to map [addr, addr + len) region.
 554  554   */
 555  555  /*ARGSUSED*/
 556  556  uint_t
 557  557  map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 558  558      int memcntl)
 559  559  {
 560  560          size_t max_lpsize = mcntl0_lpsize;
 561  561  
 562  562          if (mmu.max_page_level == 0)
 563  563                  return (0);
 564  564  
 565  565          if (flags & MAP_TEXT) {
 566  566                  if (!memcntl)
 567  567                          max_lpsize = max_utext_lpsize;
 568  568                  return (map_szcvec(addr, size, off, max_lpsize,
 569  569                      shm_lpg_min_physmem));
 570  570  
 571  571          } else if (flags & MAP_INITDATA) {
 572  572                  if (!memcntl)
 573  573                          max_lpsize = max_uidata_lpsize;
 574  574                  return (map_szcvec(addr, size, off, max_lpsize,
 575  575                      privm_lpg_min_physmem));
 576  576  
 577  577          } else if (type == MAPPGSZC_SHM) {
 578  578                  if (!memcntl)
 579  579                          max_lpsize = max_shm_lpsize;
 580  580                  return (map_szcvec(addr, size, off, max_lpsize,
 581  581                      shm_lpg_min_physmem));
 582  582  
 583  583          } else if (type == MAPPGSZC_HEAP) {
 584  584                  if (!memcntl)
 585  585                          max_lpsize = max_uheap_lpsize;
 586  586                  return (map_szcvec(addr, size, off, max_lpsize,
 587  587                      privm_lpg_min_physmem));
 588  588  
 589  589          } else if (type == MAPPGSZC_STACK) {
 590  590                  if (!memcntl)
 591  591                          max_lpsize = max_ustack_lpsize;
 592  592                  return (map_szcvec(addr, size, off, max_lpsize,
 593  593                      privm_lpg_min_physmem));
 594  594  
 595  595          } else {
 596  596                  if (!memcntl)
 597  597                          max_lpsize = max_privmap_lpsize;
 598  598                  return (map_szcvec(addr, size, off, max_lpsize,
 599  599                      privm_lpg_min_physmem));
 600  600          }
 601  601  }
 602  602  
 603  603  /*
 604  604   * Handle a pagefault.
 605  605   */
 606  606  faultcode_t
 607  607  pagefault(
 608  608          caddr_t addr,
 609  609          enum fault_type type,
 610  610          enum seg_rw rw,
 611  611          int iskernel)
 612  612  {
 613  613          struct as *as;
 614  614          struct hat *hat;
 615  615          struct proc *p;
 616  616          kthread_t *t;
 617  617          faultcode_t res;
 618  618          caddr_t base;
 619  619          size_t len;
 620  620          int err;
 621  621          int mapped_red;
 622  622          uintptr_t ea;
 623  623  
 624  624          ASSERT_STACK_ALIGNED();
 625  625  
 626  626          if (INVALID_VADDR(addr))
 627  627                  return (FC_NOMAP);
 628  628  
 629  629          mapped_red = segkp_map_red();
 630  630  
 631  631          if (iskernel) {
 632  632                  as = &kas;
 633  633                  hat = as->a_hat;
 634  634          } else {
 635  635                  t = curthread;
 636  636                  p = ttoproc(t);
 637  637                  as = p->p_as;
 638  638                  hat = as->a_hat;
 639  639          }
 640  640  
 641  641          /*
 642  642           * Dispatch pagefault.
 643  643           */
 644  644          res = as_fault(hat, as, addr, 1, type, rw);
 645  645  
 646  646          /*
 647  647           * If this isn't a potential unmapped hole in the user's
 648  648           * UNIX data or stack segments, just return status info.
 649  649           */
 650  650          if (res != FC_NOMAP || iskernel)
 651  651                  goto out;
 652  652  
 653  653          /*
 654  654           * Check to see if we happened to fault on a currently unmapped
 655  655           * part of the UNIX data or stack segments.  If so, create a zfod
 656  656           * mapping there and then try calling the fault routine again.
 657  657           */
 658  658          base = p->p_brkbase;
 659  659          len = p->p_brksize;
 660  660  
 661  661          if (addr < base || addr >= base + len) {                /* data seg? */
 662  662                  base = (caddr_t)p->p_usrstack - p->p_stksize;
 663  663                  len = p->p_stksize;
 664  664                  if (addr < base || addr >= p->p_usrstack) {     /* stack seg? */
 665  665                          /* not in either UNIX data or stack segments */
 666  666                          res = FC_NOMAP;
 667  667                          goto out;
 668  668                  }
 669  669          }
 670  670  
 671  671          /*
 672  672           * the rest of this function implements a 3.X 4.X 5.X compatibility
 673  673           * This code is probably not needed anymore
 674  674           */
 675  675          if (p->p_model == DATAMODEL_ILP32) {
 676  676  
 677  677                  /* expand the gap to the page boundaries on each side */
 678  678                  ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 679  679                  base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 680  680                  len = ea - (uintptr_t)base;
 681  681  
 682  682                  as_rangelock(as);
 683  683                  if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 684  684                      0) {
 685  685                          err = as_map(as, base, len, segvn_create, zfod_argsp);
 686  686                          as_rangeunlock(as);
 687  687                          if (err) {
 688  688                                  res = FC_MAKE_ERR(err);
 689  689                                  goto out;
 690  690                          }
 691  691                  } else {
 692  692                          /*
 693  693                           * This page is already mapped by another thread after
 694  694                           * we returned from as_fault() above.  We just fall
 695  695                           * through as_fault() below.
 696  696                           */
 697  697                          as_rangeunlock(as);
 698  698                  }
 699  699  
 700  700                  res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 701  701          }
 702  702  
 703  703  out:
 704  704          if (mapped_red)
 705  705                  segkp_unmap_red();
 706  706  
 707  707          return (res);
 708  708  }
 709  709  
 710  710  void
 711  711  map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 712  712  {
 713  713          struct proc *p = curproc;
 714  714          caddr_t userlimit = (flags & _MAP_LOW32) ?
 715  715              (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 716  716  
 717  717          map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 718  718  }
 719  719  
 720  720  /*ARGSUSED*/
 721  721  int
 722  722  map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 723  723  {
 724  724          return (0);
 725  725  }
 726  726  
 727  727  /*
 728  728   * The maximum amount a randomized mapping will be slewed.  We should perhaps
 729  729   * arrange things so these tunables can be separate for mmap, mmapobj, and
 730  730   * ld.so
 731  731   */
 732  732  size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 733  733  
 734  734  /*
 735  735   * map_addr_proc() is the routine called when the system is to
 736  736   * choose an address for the user.  We will pick an address
 737  737   * range which is the highest available below userlimit.
 738  738   *
 739  739   * Every mapping will have a redzone of a single page on either side of
 740  740   * the request. This is done to leave one page unmapped between segments.
 741  741   * This is not required, but it's useful for the user because if their
 742  742   * program strays across a segment boundary, it will catch a fault
 743  743   * immediately making debugging a little easier.  Currently the redzone
 744  744   * is mandatory.
 745  745   *
 746  746   * addrp is a value/result parameter.
 747  747   *      On input it is a hint from the user to be used in a completely
 748  748   *      machine dependent fashion.  We decide to completely ignore this hint.
 749  749   *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 750  750   *      must be some "power of two" multiple of pagesize.
 751  751   *
 752  752   *      On output it is NULL if no address can be found in the current
 753  753   *      process's address space or else an address that is currently
 754  754   *      not mapped for len bytes with a page of red zone on either side.
 755  755   *
 756  756   *      vacalign is not needed on x86 (it's for virtually addressed caches)
            *
            * off is reduced to its phase within the chosen alignment; the address
            *      returned has exactly that phase (see the final ASSERT).
            *
            * flags: MAP_ALIGN selects the alignment behaviour described above;
            *      _MAP_RANDOMIZE moves the result down within the same gap by a
            *      random, alignment-preserving amount.
 757  757   */
 758  758  /*ARGSUSED*/
 759  759  void
 760  760  map_addr_proc(
 761  761          caddr_t *addrp,
 762  762          size_t len,
 763  763          offset_t off,
 764  764          int vacalign,
 765  765          caddr_t userlimit,
 766  766          struct proc *p,
 767  767          uint_t flags)
 768  768  {
 769  769          struct as *as = p->p_as;
 770  770          caddr_t addr;
 771  771          caddr_t base;
 772  772          size_t slen;
 773  773          size_t align_amount;
 774  774  
 775  775          ASSERT32(userlimit == as->a_userlimit);
 776  776  
                   /*
                    * The search window is [base, base + slen): it starts at the
                    * process's p_brkbase and its length is worked out below.
                    */
 777  777          base = p->p_brkbase;
 778  778  #if defined(__amd64)
 779  779          if (p->p_model == DATAMODEL_NATIVE) {
 780  780                  if (userlimit < as->a_userlimit) {
 781  781                          /*
 782  782                           * This happens when a program wants to map
 783  783                           * something in a range that's accessible to a
 784  784                           * program in a smaller address space.  For example,
 785  785                           * a 64-bit program calling mmap32(2) to guarantee
 786  786                           * that the returned address is below 4Gbytes.
 787  787                           */
 788  788                          ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 789  789  
 790  790                          if (userlimit > base)
 791  791                                  slen = userlimit - base;
 792  792                          else {
 793  793                                  *addrp = NULL;
 794  794                                  return;
 795  795                          }
 796  796                  } else {
 797  797                          /*
 798  798                           * With the stack positioned at a higher address than
 799  799                           * the heap for 64-bit processes, it is necessary to be
 800  800                           * mindful of its location and potential size.
 801  801                           *
 802  802                           * Unallocated space above the top of the stack (that
 803  803                           * is, at a lower address) but still within the bounds
 804  804                           * of the stack limit should be considered unavailable.
 805  805                           *
 806  806                           * As the 64-bit stack guard is mapped in immediately
 807  807                           * adjacent to the stack limit boundary, this prevents
 808  808                           * new mappings from having accidentally dangerous
 809  809                           * proximity to the stack.
 810  810                           */
 811  811                          slen = p->p_usrstack - base -
 812  812                              ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 813  813                  }
 814  814          } else
 815  815  #endif /* defined(__amd64) */
 816  816                  slen = userlimit - base;
 817  817  
 818  818          /* Make len be a multiple of PAGESIZE */
 819  819          len = (len + PAGEOFFSET) & PAGEMASK;
 820  820  
 821  821          /*
 822  822           * figure out what the alignment should be
 823  823           *
 824  824           * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 825  825           */
 826  826          if (len <= ELF_386_MAXPGSZ) {
 827  827                  /*
 828  828                   * Align virtual addresses to ensure that ELF shared libraries
 829  829                   * are mapped with the appropriate alignment constraints by
 830  830                   * the run-time linker.
 831  831                   */
 832  832                  align_amount = ELF_386_MAXPGSZ;
 833  833          } else {
 834  834                  /*
 835  835                   * For 32-bit processes, only those which have specified
 836  836                   * MAP_ALIGN and an addr will be aligned on a larger page size.
 837  837                   * Not doing so can potentially waste up to 1G of process
 838  838                   * address space.
 839  839                   */
 840  840                  int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 841  841                      mmu.umax_page_level;
 842  842  
                           /*
                            * Step down to the largest page level whose size does
                            * not exceed len (level 0 at worst).
                            */
 843  843                  while (lvl && len < LEVEL_SIZE(lvl))
 844  844                          --lvl;
 845  845  
 846  846                  align_amount = LEVEL_SIZE(lvl);
 847  847          }
                   /* A MAP_ALIGN request may only raise the alignment, never lower it */
 848  848          if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 849  849                  align_amount = (uintptr_t)*addrp;
 850  850  
 851  851          ASSERT(ISP2(align_amount));
 852  852          ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 853  853  
                   /* Keep only the phase of off within one alignment unit */
 854  854          off = off & (align_amount - 1);
 855  855  
 856  856          /*
 857  857           * Look for a large enough hole starting below userlimit.
 858  858           * After finding it, use the upper part.
 859  859           */
 860  860          if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 861  861              PAGESIZE, off) == 0) {
 862  862                  caddr_t as_addr;
 863  863  
 864  864                  /*
 865  865                   * addr is the highest possible address to use since we have
 866  866                   * a PAGESIZE redzone at the beginning and end.
 867  867                   */
 868  868                  addr = base + slen - (PAGESIZE + len);
 869  869                  as_addr = addr;
 870  870                  /*
 871  871                   * Round address DOWN to the alignment amount and
 872  872                   * add the offset in.
 873  873                   * If addr is greater than as_addr, len would not be large
 874  874                   * enough to include the redzone, so we must adjust down
 875  875                   * by the alignment amount.
 876  876                   */
 877  877                  addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 878  878                  addr += (uintptr_t)off;
 879  879                  if (addr > as_addr) {
 880  880                          addr -= align_amount;
 881  881                  }
 882  882  
 883  883                  /*
 884  884                   * If randomization is requested, slew the allocation
 885  885                   * backwards, within the same gap, by a random amount.
 886  886                   */
 887  887                  if (flags & _MAP_RANDOMIZE) {
 888  888                          uint32_t slew;
 889  889  
 890  890                          (void) random_get_pseudo_bytes((uint8_t *)&slew,
 891  891                              sizeof (slew));
 892  892  
                                   /*
                                    * Bound the slew by both the tunable and the room
                                    * left below addr, then round it down to a multiple
                                    * of the alignment so addr keeps its phase.
                                    * NOTE(review): the modulus is zero if
                                    * aslr_max_map_skew is 0 or addr == base; confirm
                                    * neither can happen here.
                                    */
 893  893                          slew = slew % MIN(aslr_max_map_skew, (addr - base));
 894  894                          addr -= P2ALIGN(slew, align_amount);
 895  895                  }
 896  896  
 897  897                  ASSERT(addr > base);
 898  898                  ASSERT(addr + len < base + slen);
 899  899                  ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 900  900                      ((uintptr_t)(off)));
 901  901                  *addrp = addr;
 902  902          } else {
 903  903                  *addrp = NULL;  /* no more virtual space */
 904  904          }
 905  905  }
 906  906  
           /*
            * Incremented each time valid_va_range_aligned() finds that
            * *basep + *lenp wrapped past the top of the address space.  The
            * function itself never reads it back.
            */
 907  907  int valid_va_range_aligned_wraparound;
 908  908  
 909  909  /*
 910  910   * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 911  911   * addresses at least "minlen" long, where the base of the range is at "off"
 912  912   * phase from an "align" boundary and there is space for a "redzone"-sized
 913  913   * redzone on either side of the range.  On success, 1 is returned and *basep
 914  914   * and *lenp are adjusted to describe the acceptable range (including
 915  915   * the redzone).  On failure, 0 is returned.
            *
            * "dir" (AH_LO or otherwise) is only consulted on amd64, to pick which
            * side of the VA hole to keep when the candidate range straddles it and
            * both sides are big enough.
 916  916   */
 917  917  /*ARGSUSED3*/
 918  918  int
 919  919  valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 920  920      size_t align, size_t redzone, size_t off)
 921  921  {
 922  922          uintptr_t hi, lo;
 923  923          size_t tot_len;
 924  924  
 925  925          ASSERT(align == 0 ? off == 0 : off < align);
 926  926          ASSERT(ISP2(align));
 927  927          ASSERT(align == 0 || align >= PAGESIZE);
 928  928  
 929  929          lo = (uintptr_t)*basep;
 930  930          hi = lo + *lenp;
 931  931          tot_len = minlen + 2 * redzone; /* need at least this much space */
 932  932  
 933  933          /*
 934  934           * If hi rolled over the top, try cutting back.
                    * The new length makes lo + *lenp equal to the largest
                    * uintptr_t value.
 935  935           */
 936  936          if (hi < lo) {
 937  937                  *lenp = 0UL - lo - 1UL;
 938  938                  /* See if this really happens. If so, then we figure out why */
 939  939                  valid_va_range_aligned_wraparound++;
 940  940                  hi = lo + *lenp;
 941  941          }
 942  942          if (*lenp < tot_len) {
 943  943                  return (0);
 944  944          }
 945  945  
 946  946  #if defined(__amd64)
 947  947          /*
 948  948           * Deal with a possible hole in the address range between
 949  949           * hole_start and hole_end that should never be mapped.
 950  950           */
 951  951          if (lo < hole_start) {
 952  952                  if (hi > hole_start) {
 953  953                          if (hi < hole_end) {
                                           /* range ends inside the hole: trim the top */
 954  954                                  hi = hole_start;
 955  955                          } else {
 956  956                                  /* lo < hole_start && hi >= hole_end */
 957  957                                  if (dir == AH_LO) {
 958  958                                          /*
 959  959                                           * prefer lowest range
 960  960                                           */
 961  961                                          if (hole_start - lo >= tot_len)
 962  962                                                  hi = hole_start;
 963  963                                          else if (hi - hole_end >= tot_len)
 964  964                                                  lo = hole_end;
 965  965                                          else
 966  966                                                  return (0);
 967  967                                  } else {
 968  968                                          /*
 969  969                                           * prefer highest range
 970  970                                           */
 971  971                                          if (hi - hole_end >= tot_len)
 972  972                                                  lo = hole_end;
 973  973                                          else if (hole_start - lo >= tot_len)
 974  974                                                  hi = hole_start;
 975  975                                          else
 976  976                                                  return (0);
 977  977                                  }
 978  978                          }
 979  979                  }
 980  980          } else {
 981  981                  /* lo >= hole_start */
 982  982                  if (hi < hole_end)
 983  983                          return (0);
 984  984                  if (lo < hole_end)
 985  985                          lo = hole_end;
 986  986          }
 987  987  #endif
 988  988  
 989  989          if (hi - lo < tot_len)
 990  990                  return (0);
 991  991  
                   /*
                    * With a real alignment request, make sure an address of the
                    * required phase exists at or above lo + redzone and still leaves
                    * minlen bytes below hi - redzone.  tlo/thi are scratch values
                    * only; the range handed back is the unaligned [lo, hi).
                    */
 992  992          if (align > 1) {
 993  993                  uintptr_t tlo = lo + redzone;
 994  994                  uintptr_t thi = hi - redzone;
 995  995                  tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
                           /* presumably only true if the round-up wrapped -- confirm */
 996  996                  if (tlo < lo + redzone) {
 997  997                          return (0);
 998  998                  }
 999  999                  if (thi < tlo || thi - tlo < minlen) {
1000 1000                          return (0);
1001 1001                  }
1002 1002          }
1003 1003  
1004 1004          *basep = (caddr_t)lo;
1005 1005          *lenp = hi - lo;
1006 1006          return (1);
1007 1007  }
1008 1008  
1009 1009  /*
1010 1010   * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1011 1011   * addresses at least "minlen" long.  On success, 1 is returned and *basep
1012 1012   * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1013 1013   * is returned.
            *
            * This is valid_va_range_aligned() called with align, redzone and off
            * all zero, i.e. with no alignment or redzone requirement.
1014 1014   */
1015 1015  int
1016 1016  valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1017 1017  {
1018 1018          return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1019 1019  }
1020 1020  
1021 1021  /*
1022 1022   * Default to forbidding the first 64k of address space.  This protects most
1023 1023   * reasonably sized structures from dereferences through NULL:
1024 1024   *     ((foo_t *)0)->bar
            *
            * valid_usr_range() applies this only to address spaces whose process
            * has PROC_SEC_FORBIDNULLMAP enabled, and compares with "<=", so a
            * range starting exactly at this value is rejected as well.
1025 1025   */
1026 1026  uintptr_t forbidden_null_mapping_sz = 0x10000;
1027 1027  
1028 1028  /*
1029 1029   * Determine whether [addr, addr + len) are valid user addresses.
            *
            * Returns RANGE_OKAY or RANGE_BADADDR.  The end address is exclusive:
            * addr + len may equal userlimit (and, on amd64, hole_start) without
            * being rejected.  A zero len, or a len that makes addr + len wrap,
            * is rejected.  prot is not examined.
1030 1030   */
1031 1031  /*ARGSUSED*/
1032 1032  int
1033 1033  valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1034 1034      caddr_t userlimit)
1035 1035  {
1036 1036          caddr_t eaddr = addr + len;
1037 1037  
                   /* eaddr <= addr catches both len == 0 and address wraparound */
1038 1038          if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1039 1039                  return (RANGE_BADADDR);
1040 1040  
                   /*
                    * Refuse ranges that start in the forbidden low region, but only
                    * for address spaces that have a process with
                    * PROC_SEC_FORBIDNULLMAP enabled.
                    */
1041 1041          if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1042 1042              as->a_proc != NULL &&
1043 1043              secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1044 1044                  return (RANGE_BADADDR);
1045 1045  
1046 1046  #if defined(__amd64)
1047 1047          /*
1048 1048           * Check for the VA hole
                    * (any overlap of [addr, eaddr) with [hole_start, hole_end) fails)
1049 1049           */
1050 1050          if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1051 1051                  return (RANGE_BADADDR);
1052 1052  #endif
1053 1053  
1054 1054          return (RANGE_OKAY);
1055 1055  }
1056 1056  
1057 1057  /*
1058 1058   * Return 1 if the page frame is onboard memory, else 0.
            *
            * A pfn for which pfn_is_foreign() is true is never treated as memory.
            * Otherwise the answer is whatever address_in_memlist() reports for
            * the frame's physical address against the phys_install list.
1059 1059   */
1060 1060  int
1061 1061  pf_is_memory(pfn_t pf)
1062 1062  {
1063 1063          if (pfn_is_foreign(pf))
1064 1064                  return (0);
1065 1065          return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1066 1066  }
1067 1067  
1068 1068  /*
1069 1069   * return the memrange containing pfn
            *
            * Scans memranges[] from index 0 and returns the first index whose
            * entry is <= pfn.  If none of the first nranges - 1 entries match,
            * the last index (nranges - 1) is returned without being tested.
            * NOTE(review): this presumably relies on memranges[] being sorted in
            * descending order -- confirm against its definition.
1070 1070   */
1071 1071  int
1072 1072  memrange_num(pfn_t pfn)
1073 1073  {
1074 1074          int n;
1075 1075  
1076 1076          for (n = 0; n < nranges - 1; ++n) {
1077 1077                  if (pfn >= memranges[n])
1078 1078                          break;
1079 1079          }
1080 1080          return (n);
1081 1081  }
1082 1082  
1083 1083  /*
1084 1084   * return the mnoderange containing pfn
            *
            * Under __xpv this is always 0.  Otherwise the mnoderanges[] chain is
            * followed from mtypetop through mnr_next, and the index of the first
            * entry whose mnr_pfnlo is <= pfn is returned.  Only mnr_pfnlo is
            * compared; mnr_pfnhi is not checked.  If the chain ends without a
            * match the result is -1.
1085 1085   */
1086 1086  /*ARGSUSED*/
1087 1087  int
1088 1088  pfn_2_mtype(pfn_t pfn)
1089 1089  {
1090 1090  #if defined(__xpv)
1091 1091          return (0);
1092 1092  #else
1093 1093          int     n;
1094 1094  
1095 1095          /* Always start from highest pfn and work our way down */
1096 1096          for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1097 1097                  if (pfn >= mnoderanges[n].mnr_pfnlo) {
1098 1098                          break;
1099 1099                  }
1100 1100          }
1101 1101          return (n);
1102 1102  #endif
1103 1103  }
1104 1104  
1105 1105  #if !defined(__xpv)
1106 1106  /*
1107 1107   * is_contigpage_free:
1108 1108   *      returns a page list of contiguous pages. It minimally has to return
1109 1109   *      minctg pages. Caller determines minctg based on the scatter-gather
1110 1110   *      list length.
1111 1111   *
1112 1112   *      pfnp is set to the next page frame to search on return.
            *
            *      pgcnt is the number of pages still wanted; on success it is
            *      reduced by the number of pages returned.  On failure it is left
            *      alone and NULL is returned.
            *
            *      pfnseg is used as a mask: a run may not continue across a pfn
            *      whose (pfn & pfnseg) is 0.
            *
            *      Pages on the returned list are held SE_EXCL and, if iolock is
            *      non-zero, are also io-locked.
1113 1113   */
1114 1114  static page_t *
1115 1115  is_contigpage_free(
1116 1116          pfn_t *pfnp,
1117 1117          pgcnt_t *pgcnt,
1118 1118          pgcnt_t minctg,
1119 1119          uint64_t pfnseg,
1120 1120          int iolock)
1121 1121  {
1122 1122          int     i = 0;
1123 1123          pfn_t   pfn = *pfnp;
1124 1124          page_t  *pp;
1125 1125          page_t  *plist = NULL;
1126 1126  
1127 1127          /*
1128 1128           * fail if pfn + minctg crosses a segment boundary.
1129 1129           * Adjust for next starting pfn to begin at segment boundary.
1130 1130           */
1131 1131  
1132 1132          if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1133 1133                  *pfnp = roundup(*pfnp, pfnseg + 1);
1134 1134                  return (NULL);
1135 1135          }
1136 1136  
1137 1137          do {
1138 1138  retry:
1139 1139                  pp = page_numtopp_nolock(pfn + i);
                           /*
                            * Stop at a missing page, a dump page, or one we cannot
                            * lock exclusively.  The extra increment here, combined
                            * with "*pfnp += i" after the loop, makes the next search
                            * start just past the page that stopped us.
                            */
1140 1140                  if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1141 1141                      (page_trylock(pp, SE_EXCL) == 0)) {
1142 1142                          (*pfnp)++;
1143 1143                          break;
1144 1144                  }
                           /* The lookup was unlocked: re-check identity now we hold it */
1145 1145                  if (page_pptonum(pp) != pfn + i) {
1146 1146                          page_unlock(pp);
1147 1147                          goto retry;
1148 1148                  }
1149 1149  
1150 1150                  if (!(PP_ISFREE(pp))) {
1151 1151                          page_unlock(pp);
1152 1152                          (*pfnp)++;
1153 1153                          break;
1154 1154                  }
1155 1155  
                           /*
                            * Take the page off whichever list holds it; a page that
                            * is not "aged" comes off the cache list and is hashed out.
                            */
1156 1156                  if (!PP_ISAGED(pp)) {
1157 1157                          page_list_sub(pp, PG_CACHE_LIST);
1158 1158                          page_hashout(pp, (kmutex_t *)NULL);
1159 1159                  } else {
1160 1160                          page_list_sub(pp, PG_FREE_LIST);
1161 1161                  }
1162 1162  
1163 1163                  if (iolock)
1164 1164                          page_io_lock(pp);
1165 1165                  page_list_concat(&plist, &pp);
1166 1166  
1167 1167                  /*
1168 1168                   * exit loop when pgcnt satisfied or segment boundary reached.
1169 1169                   */
1170 1170  
1171 1171          } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1172 1172  
1173 1173          *pfnp += i;             /* set to next pfn to search */
1174 1174  
1175 1175          if (i >= minctg) {
1176 1176                  *pgcnt -= i;
1177 1177                  return (plist);
1178 1178          }
1179 1179  
1180 1180          /*
1181 1181           * failure: minctg not satisfied.
1182 1182           *
1183 1183           * if next request crosses segment boundary, set next pfn
1184 1184           * to search from the segment boundary.
1185 1185           */
1186 1186          if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1187 1187                  *pfnp = roundup(*pfnp, pfnseg + 1);
1188 1188  
1189 1189          /* clean up any pages already allocated */
                   /* (all of them go to the tail of the free list, then are unlocked) */
1190 1190  
1191 1191          while (plist) {
1192 1192                  pp = plist;
1193 1193                  page_sub(&plist, pp);
1194 1194                  page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1195 1195                  if (iolock)
1196 1196                          page_io_unlock(pp);
1197 1197                  page_unlock(pp);
1198 1198          }
1199 1199  
1200 1200          return (NULL);
1201 1201  }
1202 1202  #endif  /* !__xpv */
1203 1203  
1204 1204  /*
1205 1205   * verify that pages being returned from allocator have correct DMA attribute
            *
            * Without DEBUG this is a macro that expands to a no-op.  With DEBUG it
            * follows p_next for cnt pages starting at pp and panics if the value
            * pa_to_ma() yields for a page is below dma_attr_addr_lo or at/above
            * dma_attr_addr_hi.  A NULL dma_attr disables the check.
1206 1206   */
1207 1207  #ifndef DEBUG
1208 1208  #define check_dma(a, b, c) (void)(0)
1209 1209  #else
1210 1210  static void
1211 1211  check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1212 1212  {
1213 1213          if (dma_attr == NULL)
1214 1214                  return;
1215 1215  
1216 1216          while (cnt-- > 0) {
1217 1217                  if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1218 1218                      dma_attr->dma_attr_addr_lo)
1219 1219                          panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1220 1220                  if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1221 1221                      dma_attr->dma_attr_addr_hi)
1222 1222                          panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1223 1223                  pp = pp->p_next;
1224 1224          }
1225 1225  }
1226 1226  #endif
1227 1227  
1228 1228  #if !defined(__xpv)
           /*
            * page_get_contigpage:
            *      collects *pgcnt pages as at most sgllen physically contiguous
            *      runs by calling is_contigpage_free() repeatedly, and returns
            *      them as one page list, or NULL if the search fails.
            *
            *      mattr, when non-NULL, supplies the pfn bounds (dma_attr_addr_lo
            *      and dma_attr_addr_hi), the run limit (dma_attr_sgllen), the
            *      segment mask (dma_attr_seg) and the alignment.  When NULL the
            *      whole range [0, physmax - 1] is searched for a single run.
            *
            *      *pgcnt is decremented as runs are found.  iolock is passed
            *      through to is_contigpage_free().
            *
            *      startpfn and lastctgcnt are static, so they persist between
            *      calls; every access to them here is made between CONTIG_LOCK()
            *      and CONTIG_UNLOCK().
            *
            *      NOTE(review): both "return (NULL)" paths leave any pages already
            *      moved onto pplist where they are -- confirm a partial success
            *      followed by failure cannot happen or is handled by callers.
            */
1229 1229  static page_t *
1230 1230  page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1231 1231  {
1232 1232          pfn_t           pfn;
1233 1233          int             sgllen;
1234 1234          uint64_t        pfnseg;
1235 1235          pgcnt_t         minctg;
1236 1236          page_t          *pplist = NULL, *plist;
1237 1237          uint64_t        lo, hi;
1238 1238          pgcnt_t         pfnalign = 0;
1239 1239          static pfn_t    startpfn;
1240 1240          static pgcnt_t  lastctgcnt;
1241 1241          uintptr_t       align;
1242 1242  
1243 1243          CONTIG_LOCK();
1244 1244  
1245 1245          if (mattr) {
1246 1246                  lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1247 1247                  hi = mmu_btop(mattr->dma_attr_addr_hi);
1248 1248                  if (hi >= physmax)
1249 1249                          hi = physmax - 1;
1250 1250                  sgllen = mattr->dma_attr_sgllen;
1251 1251                  pfnseg = mmu_btop(mattr->dma_attr_seg);
1252 1252  
                           /* only alignments larger than a page constrain the pfn */
1253 1253                  align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1254 1254                  if (align > MMU_PAGESIZE)
1255 1255                          pfnalign = mmu_btop(align);
1256 1256  
1257 1257                  /*
1258 1258                   * in order to satisfy the request, must minimally
1259 1259                   * acquire minctg contiguous pages
1260 1260                   */
1261 1261                  minctg = howmany(*pgcnt, sgllen);
1262 1262  
1263 1263                  ASSERT(hi >= lo);
1264 1264  
1265 1265                  /*
1266 1266                   * start from where last searched if the minctg >= lastctgcnt
1267 1267                   */
1268 1268                  if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1269 1269                          startpfn = lo;
1270 1270          } else {
1271 1271                  hi = physmax - 1;
1272 1272                  lo = 0;
1273 1273                  sgllen = 1;
1274 1274                  pfnseg = mmu.highest_pfn;
1275 1275                  minctg = *pgcnt;
1276 1276  
1277 1277                  if (minctg < lastctgcnt)
1278 1278                          startpfn = lo;
1279 1279          }
1280 1280          lastctgcnt = minctg;
1281 1281  
1282 1282          ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1283 1283  
1284 1284          /* conserve 16m memory - start search above 16m when possible */
1285 1285          if (hi > PFN_16M && startpfn < PFN_16M)
1286 1286                  startpfn = PFN_16M;
1287 1287  
1288 1288          pfn = startpfn;
1289 1289          if (pfnalign)
1290 1290                  pfn = P2ROUNDUP(pfn, pfnalign);
1291 1291  
                   /* first pass: from startpfn up to hi */
1292 1292          while (pfn + minctg - 1 <= hi) {
1293 1293  
1294 1294                  plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1295 1295                  if (plist) {
1296 1296                          page_list_concat(&pplist, &plist);
1297 1297                          sgllen--;
1298 1298                          /*
1299 1299                           * return when contig pages no longer needed
                                    *
                                    * NOTE(review): *pgcnt is the count still
                                    * outstanding at this point, not the length of
                                    * pplist -- confirm that is the count check_dma()
                                    * is meant to receive.
1300 1300                           */
1301 1301                          if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1302 1302                                  startpfn = pfn;
1303 1303                                  CONTIG_UNLOCK();
1304 1304                                  check_dma(mattr, pplist, *pgcnt);
1305 1305                                  return (pplist);
1306 1306                          }
                                   /* spread what is left over the remaining runs */
1307 1307                          minctg = howmany(*pgcnt, sgllen);
1308 1308                  }
1309 1309                  if (pfnalign)
1310 1310                          pfn = P2ROUNDUP(pfn, pfnalign);
1311 1311          }
1312 1312  
1313 1313          /* cannot find contig pages in specified range */
1314 1314          if (startpfn == lo) {
1315 1315                  CONTIG_UNLOCK();
1316 1316                  return (NULL);
1317 1317          }
1318 1318  
1319 1319          /* did not start with lo previously */
                   /* second pass: wrap around and search from lo up towards startpfn */
1320 1320          pfn = lo;
1321 1321          if (pfnalign)
1322 1322                  pfn = P2ROUNDUP(pfn, pfnalign);
1323 1323  
1324 1324          /* allow search to go above startpfn */
1325 1325          while (pfn < startpfn) {
1326 1326  
1327 1327                  plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1328 1328                  if (plist != NULL) {
1329 1329  
1330 1330                          page_list_concat(&pplist, &plist);
1331 1331                          sgllen--;
1332 1332  
1333 1333                          /*
1334 1334                           * return when contig pages no longer needed
1335 1335                           */
1336 1336                          if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1337 1337                                  startpfn = pfn;
1338 1338                                  CONTIG_UNLOCK();
1339 1339                                  check_dma(mattr, pplist, *pgcnt);
1340 1340                                  return (pplist);
1341 1341                          }
1342 1342                          minctg = howmany(*pgcnt, sgllen);
1343 1343                  }
1344 1344                  if (pfnalign)
1345 1345                          pfn = P2ROUNDUP(pfn, pfnalign);
1346 1346          }
1347 1347          CONTIG_UNLOCK();
1348 1348          return (NULL);
1349 1349  }
1350 1350  #endif  /* !__xpv */
1351 1351  
1352 1352  /*
1353 1353   * mnode_range_cnt() calculates the number of memory ranges for mnode and
1354 1354   * memranges[]. Used to determine the size of page lists and mnoderanges.
            *
            * Under __xpv the answer is always 1 (and mnode must be 0).  Otherwise
            * a memory node that does not exist yields 0.
1355 1355   */
1356 1356  int
1357 1357  mnode_range_cnt(int mnode)
1358 1358  {
1359 1359  #if defined(__xpv)
1360 1360          ASSERT(mnode == 0);
1361 1361          return (1);
1362 1362  #else   /* __xpv */
1363 1363          int     mri;
1364 1364          int     mnrcnt = 0;
1365 1365  
1366 1366          if (mem_node_config[mnode].exists != 0) {
1367 1367                  mri = nranges - 1;
1368 1368  
1369 1369                  /*
                            * walk mri down from the last index until MEMRANGEHI(mri)
                            * reaches the mnode's physbase
                            * NOTE(review): this loop has no "mri >= 0" guard of its
                            * own -- presumably some range always qualifies; confirm.
                            */
1370 1370  
1371 1371                  while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1372 1372                          mri--;
1373 1373  
1374 1374                  /*
1375 1375                   * increment mnode range counter when memranges or mnode
1376 1376                   * boundary is reached.
1377 1377                   */
1378 1378                  while (mri >= 0 &&
1379 1379                      mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380 1380                          mnrcnt++;
                                   /* keep going only while the mnode extends past this range */
1381 1381                          if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))

↓ open down ↓

1104 lines elided

↑ open up ↑

1382 1382                                  mri--;
1383 1383                          else
1384 1384                                  break;
1385 1385                  }
1386 1386          }
1387 1387          ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388 1388          return (mnrcnt);
1389 1389  #endif  /* __xpv */
1390 1390  }
1391 1391  
1392      -/*
1393      - * mnode_range_setup() initializes mnoderanges.
1394      - */
     1392 +static int
     1393 +mnoderange_cmp(const void *v1, const void *v2)
     1394 +{
                   /*
                    * qsort() comparator used by mnode_range_setup(): orders
                    * mnoderange_t entries by ascending mnr_pfnlo.  Returns -1, 0 or 1;
                    * the comparisons are done explicitly rather than by subtraction.
                    */
     1395 +        const mnoderange_t *m1 = v1;
     1396 +        const mnoderange_t *m2 = v2;
     1397 +
     1398 +        if (m1->mnr_pfnlo < m2->mnr_pfnlo)
     1399 +                return (-1);
     1400 +        return (m1->mnr_pfnlo > m2->mnr_pfnlo);
     1401 +}
     1402 +
1395 1403  void
1396 1404  mnode_range_setup(mnoderange_t *mnoderanges)
1397 1405  {
1398      -        mnoderange_t *mp = mnoderanges;
1399      -        int     mnode, mri;
1400      -        int     mindex = 0;     /* current index into mnoderanges array */
1401      -        int     i, j;
1402      -        pfn_t   hipfn;
1403      -        int     last, hi;
     1406 +        mnoderange_t *mp;
     1407 +        size_t nr_ranges;
     1408 +        size_t mnode;
1404 1409  
1405      -        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
     1410 +        for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
     1411 +            mnode < max_mem_nodes; mnode++) {
     1412 +                size_t mri = nranges - 1;
     1413 +
1406 1414                  if (mem_node_config[mnode].exists == 0)
1407 1415                          continue;
1408 1416  
1409      -                mri = nranges - 1;
1410      -
1411 1417                  while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1412 1418                          mri--;
1413 1419  
1414 1420                  while (mri >= 0 && mem_node_config[mnode].physmax >=
1415 1421                      MEMRANGELO(mri)) {
1416      -                        mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
     1422 +                        mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1417 1423                              mem_node_config[mnode].physbase);
1418      -                        mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
     1424 +                        mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1419 1425                              mem_node_config[mnode].physmax);
1420      -                        mnoderanges->mnr_mnode = mnode;
1421      -                        mnoderanges->mnr_memrange = mri;
1422      -                        mnoderanges->mnr_exists = 1;
1423      -                        mnoderanges++;
1424      -                        mindex++;
     1426 +                        mp->mnr_mnode = mnode;
     1427 +                        mp->mnr_memrange = mri;
     1428 +                        mp->mnr_next = -1;
     1429 +                        mp->mnr_exists = 1;
     1430 +                        mp++;
     1431 +                        nr_ranges++;
1425 1432                          if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1426 1433                                  mri--;
1427 1434                          else
1428 1435                                  break;
1429 1436                  }
1430 1437          }
1431 1438  
1432 1439          /*
1433      -         * For now do a simple sort of the mnoderanges array to fill in
1434      -         * the mnr_next fields.  Since mindex is expected to be relatively
1435      -         * small, using a simple O(N^2) algorithm.
     1440 +         * mnoderangecnt can be larger than nr_ranges when memory DR is
     1441 +         * supposedly supported.
1436 1442           */
1437      -        for (i = 0; i < mindex; i++) {
1438      -                if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1439      -                        break;
1440      -        }
1441      -        ASSERT(i < mindex);
1442      -        last = i;
1443      -        mtype16m = last;
1444      -        mp[last].mnr_next = -1;
1445      -        for (i = 0; i < mindex - 1; i++) {
1446      -                hipfn = (pfn_t)(-1);
1447      -                hi = -1;
1448      -                /* find next highest mnode range */
1449      -                for (j = 0; j < mindex; j++) {
1450      -                        if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1451      -                            mp[j].mnr_pfnlo < hipfn) {
1452      -                                hipfn = mp[j].mnr_pfnlo;
1453      -                                hi = j;
1454      -                        }
1455      -                }
1456      -                mp[hi].mnr_next = last;
1457      -                last = hi;
1458      -        }
1459      -        mtypetop = last;
     1443 +        VERIFY3U(nr_ranges, <=, mnoderangecnt);
     1444 +
     1445 +        qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
     1446 +
     1447 +        /*
     1448 +         * If some intrepid soul takes the axe to the memory DR code, we can
     1449 +         * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
     1450 +         *
     1451 +         * The VERIFY3U() above can be "==" then too.
     1452 +         */
     1453 +        for (size_t i = 1; i < nr_ranges; i++)
     1454 +                mnoderanges[i].mnr_next = i - 1;
     1455 +
     1456 +        mtypetop = nr_ranges - 1;
     1457 +        mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
     1458 +        if (physmax4g)
     1459 +                mtype4g = pfn_2_mtype(0xfffff);
1460 1460  }
1461 1461  
1462 1462  #ifndef __xpv
1463 1463  /*
1464 1464   * Update mnoderanges for memory hot-add DR operations.
1465 1465   */
1466 1466  static void
1467 1467  mnode_range_add(int mnode)
1468 1468  {
1469 1469          int     *prev;

1470 1470          int     n, mri;
1471 1471          pfn_t   start, end;
1472 1472          extern  void membar_sync(void);
1473 1473  
1474 1474          ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475 1475          ASSERT(mem_node_config[mnode].exists);
1476 1476          start = mem_node_config[mnode].physbase;
1477 1477          end = mem_node_config[mnode].physmax;
1478 1478          ASSERT(start <= end);
1479 1479          mutex_enter(&mnoderange_lock);
1480 1480  
1481 1481  #ifdef  DEBUG
1482 1482          /* Check whether it interleaves with other memory nodes. */
1483 1483          for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1484 1484                  ASSERT(mnoderanges[n].mnr_exists);
1485 1485                  if (mnoderanges[n].mnr_mnode == mnode)
1486 1486                          continue;
1487 1487                  ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1488 1488                      end < mnoderanges[n].mnr_pfnlo);
1489 1489          }
1490 1490  #endif  /* DEBUG */
1491 1491  
1492 1492          mri = nranges - 1;
1493 1493          while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1494 1494                  mri--;
1495 1495          while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1496 1496                  /* Check whether mtype already exists. */
1497 1497                  for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1498 1498                          if (mnoderanges[n].mnr_mnode == mnode &&
1499 1499                              mnoderanges[n].mnr_memrange == mri) {
1500 1500                                  mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1501 1501                                      start);
1502 1502                                  mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1503 1503                                      end);
1504 1504                                  break;
1505 1505                          }
1506 1506                  }
1507 1507  
1508 1508                  /* Add a new entry if it doesn't exist yet. */
1509 1509                  if (n == -1) {
1510 1510                          /* Try to find an unused entry in mnoderanges array. */
1511 1511                          for (n = 0; n < mnoderangecnt; n++) {
1512 1512                                  if (mnoderanges[n].mnr_exists == 0)
1513 1513                                          break;
1514 1514                          }
1515 1515                          ASSERT(n < mnoderangecnt);
1516 1516                          mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1517 1517                          mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1518 1518                          mnoderanges[n].mnr_mnode = mnode;
1519 1519                          mnoderanges[n].mnr_memrange = mri;
1520 1520                          mnoderanges[n].mnr_exists = 1;
1521 1521                          /* Page 0 should always be present. */
1522 1522                          for (prev = &mtypetop;
1523 1523                              mnoderanges[*prev].mnr_pfnlo > start;
1524 1524                              prev = &mnoderanges[*prev].mnr_next) {
1525 1525                                  ASSERT(mnoderanges[*prev].mnr_next >= 0);
1526 1526                                  ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1527 1527                          }
1528 1528                          mnoderanges[n].mnr_next = *prev;
1529 1529                          membar_sync();
1530 1530                          *prev = n;
1531 1531                  }
1532 1532  
1533 1533                  if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1534 1534                          mri--;
1535 1535                  else
1536 1536                          break;
1537 1537          }
1538 1538  
1539 1539          mutex_exit(&mnoderange_lock);
1540 1540  }
1541 1541  
1542 1542  /*
1543 1543   * Update mnoderanges for memory hot-removal DR operations.
1544 1544   */
1545 1545  static void
1546 1546  mnode_range_del(int mnode)
1547 1547  {
1548 1548          _NOTE(ARGUNUSED(mnode));
1549 1549          ASSERT(0 <= mnode && mnode < max_mem_nodes);
1550 1550          /* TODO: support deletion operation. */
1551 1551          ASSERT(0);
1552 1552  }
1553 1553  
1554 1554  void
1555 1555  plat_slice_add(pfn_t start, pfn_t end)
1556 1556  {
1557 1557          mem_node_add_slice(start, end);
1558 1558          if (plat_dr_enabled()) {
1559 1559                  mnode_range_add(PFN_2_MEM_NODE(start));
1560 1560          }
1561 1561  }
1562 1562  
1563 1563  void
1564 1564  plat_slice_del(pfn_t start, pfn_t end)
1565 1565  {
1566 1566          ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1567 1567          ASSERT(plat_dr_enabled());
1568 1568          mnode_range_del(PFN_2_MEM_NODE(start));
1569 1569          mem_node_del_slice(start, end);
1570 1570  }
1571 1571  #endif  /* __xpv */
1572 1572  
1573 1573  /*ARGSUSED*/
1574 1574  int
1575 1575  mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1576 1576  {
1577 1577          int mtype = mtypetop;
1578 1578  
1579 1579  #if !defined(__xpv)
1580 1580  #if defined(__i386)
1581 1581          /*
1582 1582           * set the mtype range
1583 1583           * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1584 1584           * - for non kmem requests, set range to above 4g if memory below 4g
1585 1585           * runs low.
1586 1586           */
1587 1587          if (restricted_kmemalloc && VN_ISKAS(vp) &&
1588 1588              (caddr_t)(vaddr) >= kernelheap &&
1589 1589              (caddr_t)(vaddr) < ekernelheap) {
1590 1590                  ASSERT(physmax4g);
1591 1591                  mtype = mtype4g;
1592 1592                  if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1593 1593                      btop(pgsz), *flags)) {
1594 1594                          *flags |= PGI_MT_RANGE16M;
1595 1595                  } else {
1596 1596                          VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1597 1597                          VM_STAT_COND_ADD((*flags & PG_PANIC),
1598 1598                              vmm_vmstats.pgpanicalloc);
1599 1599                          *flags |= PGI_MT_RANGE0;
1600 1600                  }
1601 1601                  return (mtype);
1602 1602          }
1603 1603  #endif  /* __i386 */
1604 1604  
1605 1605          if (RESTRICT4G_ALLOC) {
1606 1606                  VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1607 1607                  /* here only for > 4g systems */
1608 1608                  *flags |= PGI_MT_RANGE4G;
1609 1609          } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1610 1610                  *flags |= PGI_MT_RANGE16M;
1611 1611          } else {
1612 1612                  VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1613 1613                  VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1614 1614                  *flags |= PGI_MT_RANGE0;
1615 1615          }
1616 1616  #endif /* !__xpv */
1617 1617          return (mtype);
1618 1618  }
1619 1619  
1620 1620  
1621 1621  /* mtype init for page_get_replacement_page */
1622 1622  /*ARGSUSED*/
1623 1623  int
1624 1624  mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1625 1625  {
1626 1626          int mtype = mtypetop;
1627 1627  #if !defined(__xpv)
1628 1628          if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1629 1629                  *flags |= PGI_MT_RANGE16M;
1630 1630          } else {
1631 1631                  VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1632 1632                  *flags |= PGI_MT_RANGE0;
1633 1633          }
1634 1634  #endif
1635 1635          return (mtype);
1636 1636  }
1637 1637  
1638 1638  /*
1639 1639   * Determine if the mnode range specified in mtype contains memory belonging
1640 1640   * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1641 1641   * the range from high pfn to 0, 16m or 4g.
1642 1642   *
1643 1643   * Return first mnode range type index found otherwise return -1 if none found.
1644 1644   */
1645 1645  int
1646 1646  mtype_func(int mnode, int mtype, uint_t flags)
1647 1647  {
1648 1648          if (flags & PGI_MT_RANGE) {
1649 1649                  int     mnr_lim = MRI_0;
1650 1650  
1651 1651                  if (flags & PGI_MT_NEXT) {
1652 1652                          mtype = mnoderanges[mtype].mnr_next;
1653 1653                  }
1654 1654                  if (flags & PGI_MT_RANGE4G)
1655 1655                          mnr_lim = MRI_4G;       /* exclude 0-4g range */
1656 1656                  else if (flags & PGI_MT_RANGE16M)
1657 1657                          mnr_lim = MRI_16M;      /* exclude 0-16m range */
1658 1658                  while (mtype != -1 &&
1659 1659                      mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1660 1660                          if (mnoderanges[mtype].mnr_mnode == mnode)
1661 1661                                  return (mtype);
1662 1662                          mtype = mnoderanges[mtype].mnr_next;
1663 1663                  }
1664 1664          } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1665 1665                  return (mtype);
1666 1666          }
1667 1667          return (-1);
1668 1668  }
1669 1669  
1670 1670  /*
1671 1671   * Update the page list max counts with the pfn range specified by the
1672 1672   * input parameters.
1673 1673   */
1674 1674  void
1675 1675  mtype_modify_max(pfn_t startpfn, long cnt)
1676 1676  {
1677 1677          int             mtype;
1678 1678          pgcnt_t         inc;
1679 1679          spgcnt_t        scnt = (spgcnt_t)(cnt);
1680 1680          pgcnt_t         acnt = ABS(scnt);
1681 1681          pfn_t           endpfn = startpfn + acnt;
1682 1682          pfn_t           pfn, lo;
1683 1683  
1684 1684          if (!physmax4g)
1685 1685                  return;
1686 1686  
1687 1687          mtype = mtypetop;
1688 1688          for (pfn = endpfn; pfn > startpfn; ) {
1689 1689                  ASSERT(mtype != -1);
1690 1690                  lo = mnoderanges[mtype].mnr_pfnlo;
1691 1691                  if (pfn > lo) {
1692 1692                          if (startpfn >= lo) {
1693 1693                                  inc = pfn - startpfn;
1694 1694                          } else {
1695 1695                                  inc = pfn - lo;
1696 1696                          }
1697 1697                          if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1698 1698                                  if (scnt > 0)
1699 1699                                          maxmem4g += inc;
1700 1700                                  else
1701 1701                                          maxmem4g -= inc;
1702 1702                          }
1703 1703                          pfn -= inc;
1704 1704                  }
1705 1705                  mtype = mnoderanges[mtype].mnr_next;
1706 1706          }
1707 1707  }
1708 1708  
1709 1709  int
1710 1710  mtype_2_mrange(int mtype)
1711 1711  {
1712 1712          return (mnoderanges[mtype].mnr_memrange);
1713 1713  }
1714 1714  
1715 1715  void
1716 1716  mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1717 1717  {
1718 1718          _NOTE(ARGUNUSED(mnode));
1719 1719          ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1720 1720          *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1721 1721          *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1722 1722  }
1723 1723  
1724 1724  size_t
1725 1725  plcnt_sz(size_t ctrs_sz)
1726 1726  {
1727 1727  #ifdef DEBUG
1728 1728          int     szc, colors;
1729 1729  
1730 1730          ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1731 1731          for (szc = 0; szc < mmu_page_sizes; szc++) {
1732 1732                  colors = page_get_pagecolors(szc);
1733 1733                  ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1734 1734          }
1735 1735  #endif
1736 1736          return (ctrs_sz);
1737 1737  }
1738 1738  
1739 1739  caddr_t
1740 1740  plcnt_init(caddr_t addr)
1741 1741  {
1742 1742  #ifdef DEBUG
1743 1743          int     mt, szc, colors;
1744 1744  
1745 1745          for (mt = 0; mt < mnoderangecnt; mt++) {
1746 1746                  mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1747 1747                  addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1748 1748                  for (szc = 0; szc < mmu_page_sizes; szc++) {
1749 1749                          colors = page_get_pagecolors(szc);
1750 1750                          mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1751 1751                          mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1752 1752                              (pgcnt_t *)addr;
1753 1753                          addr += (sizeof (pgcnt_t) * colors);
1754 1754                  }
1755 1755          }
1756 1756  #endif
1757 1757          return (addr);
1758 1758  }
1759 1759  
1760 1760  void
1761 1761  plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1762 1762  {
1763 1763          _NOTE(ARGUNUSED(pp));
1764 1764  #ifdef DEBUG
1765 1765          int     bin = PP_2_BIN(pp);
1766 1766  
1767 1767          atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1768 1768          atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1769 1769              cnt);
1770 1770  #endif
1771 1771          ASSERT(mtype == PP_2_MTYPE(pp));
1772 1772          if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1773 1773                  atomic_add_long(&freemem4g, cnt);
1774 1774          if (flags & PG_CACHE_LIST)
1775 1775                  atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1776 1776          else
1777 1777                  atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1778 1778          atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1779 1779  }
1780 1780  
1781 1781  /*
1782 1782   * Returns the free page count for mnode
1783 1783   */
1784 1784  int
1785 1785  mnode_pgcnt(int mnode)
1786 1786  {
1787 1787          int     mtype = mtypetop;
1788 1788          int     flags = PGI_MT_RANGE0;
1789 1789          pgcnt_t pgcnt = 0;
1790 1790  
1791 1791          mtype = mtype_func(mnode, mtype, flags);
1792 1792  
1793 1793          while (mtype != -1) {
1794 1794                  pgcnt += MTYPE_FREEMEM(mtype);
1795 1795                  mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1796 1796          }
1797 1797          return (pgcnt);
1798 1798  }
1799 1799  
1800 1800  /*
1801 1801   * Initialize page coloring variables based on the l2 cache parameters.
1802 1802   * Calculate and return memory needed for page coloring data structures.
1803 1803   */
1804 1804  size_t
1805 1805  page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1806 1806  {
1807 1807          _NOTE(ARGUNUSED(l2_linesz));
1808 1808          size_t  colorsz = 0;
1809 1809          int     i;
1810 1810          int     colors;
1811 1811  
1812 1812  #if defined(__xpv)
1813 1813          /*
1814 1814           * Hypervisor domains currently don't have any concept of NUMA.
1815 1815           * Hence we'll act like there is only 1 memrange.
1816 1816           */
1817 1817          i = memrange_num(1);
1818 1818  #else /* !__xpv */
1819 1819          /*
1820 1820           * Reduce the memory ranges lists if we don't have large amounts
1821 1821           * of memory. This avoids searching known empty free lists.
1822 1822           * To support memory DR operations, we need to keep memory ranges
1823 1823           * for possible memory hot-add operations.
1824 1824           */
1825 1825          if (plat_dr_physmax > physmax)
1826 1826                  i = memrange_num(plat_dr_physmax);
1827 1827          else
1828 1828                  i = memrange_num(physmax);
1829 1829  #if defined(__i386)
1830 1830          if (i > MRI_4G)
1831 1831                  restricted_kmemalloc = 0;
1832 1832  #endif
1833 1833          /* physmax greater than 4g */
1834 1834          if (i == MRI_4G)
1835 1835                  physmax4g = 1;
1836 1836  #endif /* !__xpv */
1837 1837          memranges += i;
1838 1838          nranges -= i;
1839 1839  
1840 1840          ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1841 1841  
1842 1842          ASSERT(ISP2(l2_linesz));
1843 1843          ASSERT(l2_sz > MMU_PAGESIZE);
1844 1844  
1845 1845          /* l2_assoc is 0 for fully associative l2 cache */
1846 1846          if (l2_assoc)
1847 1847                  l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1848 1848          else
1849 1849                  l2_colors = 1;
1850 1850  
1851 1851          ASSERT(ISP2(l2_colors));
1852 1852  
1853 1853          /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1854 1854          page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1855 1855  
1856 1856          /*
1857 1857           * cpu_page_colors is non-zero when a page color may be spread across
1858 1858           * multiple bins.
1859 1859           */
1860 1860          if (l2_colors < page_colors)
1861 1861                  cpu_page_colors = l2_colors;
1862 1862  
1863 1863          ASSERT(ISP2(page_colors));
1864 1864  
1865 1865          page_colors_mask = page_colors - 1;
1866 1866  
1867 1867          ASSERT(ISP2(CPUSETSIZE()));
1868 1868          page_coloring_shift = lowbit(CPUSETSIZE());
1869 1869  
1870 1870          /* initialize number of colors per page size */
1871 1871          for (i = 0; i <= mmu.max_page_level; i++) {
1872 1872                  hw_page_array[i].hp_size = LEVEL_SIZE(i);
1873 1873                  hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1874 1874                  hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1875 1875                  hw_page_array[i].hp_colors = (page_colors_mask >>
1876 1876                      (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1877 1877                      + 1;
1878 1878                  colorequivszc[i] = 0;
1879 1879          }
1880 1880  
1881 1881          /*
1882 1882           * The value of cpu_page_colors determines if additional color bins
1883 1883           * need to be checked for a particular color in the page_get routines.
1884 1884           */
1885 1885          if (cpu_page_colors != 0) {
1886 1886  
1887 1887                  int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1888 1888                  ASSERT(a > 0);
1889 1889                  ASSERT(a < 16);
1890 1890  
1891 1891                  for (i = 0; i <= mmu.max_page_level; i++) {
1892 1892                          if ((colors = hw_page_array[i].hp_colors) <= 1) {
1893 1893                                  colorequivszc[i] = 0;
1894 1894                                  continue;
1895 1895                          }
1896 1896                          while ((colors >> a) == 0)
1897 1897                                  a--;
1898 1898                          ASSERT(a >= 0);
1899 1899  
1900 1900                          /* higher 4 bits encode color equiv mask */
1901 1901                          colorequivszc[i] = (a << 4);
1902 1902                  }
1903 1903          }
1904 1904  
1905 1905          /* factor in colorequiv to check additional 'equivalent' bins. */
1906 1906          if (colorequiv > 1) {
1907 1907  
1908 1908                  int a = lowbit(colorequiv) - 1;
1909 1909                  if (a > 15)
1910 1910                          a = 15;
1911 1911  
1912 1912                  for (i = 0; i <= mmu.max_page_level; i++) {
1913 1913                          if ((colors = hw_page_array[i].hp_colors) <= 1) {
1914 1914                                  continue;
1915 1915                          }
1916 1916                          while ((colors >> a) == 0)
1917 1917                                  a--;
1918 1918                          if ((a << 4) > colorequivszc[i]) {
1919 1919                                  colorequivszc[i] = (a << 4);
1920 1920                          }
1921 1921                  }
1922 1922          }
1923 1923  
1924 1924          /* size for mnoderanges */
1925 1925          for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1926 1926                  mnoderangecnt += mnode_range_cnt(i);
1927 1927          if (plat_dr_support_memory()) {
1928 1928                  /*
1929 1929                   * Reserve enough space for memory DR operations.
1930 1930                   * Two extra mnoderanges for possible fragmentations,
1931 1931                   * one for the 2G boundary and the other for the 4G boundary.
1932 1932                   * We don't expect a memory board crossing the 16M boundary
1933 1933                   * for memory hot-add operations on x86 platforms.
1934 1934                   */
1935 1935                  mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1936 1936          }
1937 1937          colorsz = mnoderangecnt * sizeof (mnoderange_t);
1938 1938  
1939 1939          /* size for fpc_mutex and cpc_mutex */
1940 1940          colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1941 1941  
1942 1942          /* size of page_freelists */
1943 1943          colorsz += mnoderangecnt * sizeof (page_t ***);
1944 1944          colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1945 1945  
1946 1946          for (i = 0; i < mmu_page_sizes; i++) {
1947 1947                  colors = page_get_pagecolors(i);
1948 1948                  colorsz += mnoderangecnt * colors * sizeof (page_t *);
1949 1949          }
1950 1950  
1951 1951          /* size of page_cachelists */
1952 1952          colorsz += mnoderangecnt * sizeof (page_t **);
1953 1953          colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1954 1954  
1955 1955          return (colorsz);
1956 1956  }
1957 1957  
1958 1958  /*
1959 1959   * Called once at startup to configure page_coloring data structures and
1960 1960   * does the 1st page_free()/page_freelist_add().
1961 1961   */
1962 1962  void
1963 1963  page_coloring_setup(caddr_t pcmemaddr)
1964 1964  {
1965 1965          int     i;
1966 1966          int     j;
1967 1967          int     k;
1968 1968          caddr_t addr;
1969 1969          int     colors;
1970 1970

↓ open down ↓

501 lines elided

↑ open up ↑

1971 1971          /*
1972 1972           * do page coloring setup
1973 1973           */
1974 1974          addr = pcmemaddr;
1975 1975  
1976 1976          mnoderanges = (mnoderange_t *)addr;
1977 1977          addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 1978  
1979 1979          mnode_range_setup(mnoderanges);
1980 1980  
1981      -        if (physmax4g)
1982      -                mtype4g = pfn_2_mtype(0xfffff);
1983      -
1984 1981          for (k = 0; k < NPC_MUTEX; k++) {
1985 1982                  fpc_mutex[k] = (kmutex_t *)addr;
1986 1983                  addr += (max_mem_nodes * sizeof (kmutex_t));
1987 1984          }
1988 1985          for (k = 0; k < NPC_MUTEX; k++) {
1989 1986                  cpc_mutex[k] = (kmutex_t *)addr;
1990 1987                  addr += (max_mem_nodes * sizeof (kmutex_t));
1991 1988          }
1992 1989          page_freelists = (page_t ****)addr;
1993 1990          addr += (mnoderangecnt * sizeof (page_t ***));

1994 1991  
1995 1992          page_cachelists = (page_t ***)addr;
1996 1993          addr += (mnoderangecnt * sizeof (page_t **));
1997 1994  
1998 1995          for (i = 0; i < mnoderangecnt; i++) {
1999 1996                  page_freelists[i] = (page_t ***)addr;
2000 1997                  addr += (mmu_page_sizes * sizeof (page_t **));
2001 1998  
2002 1999                  for (j = 0; j < mmu_page_sizes; j++) {
2003 2000                          colors = page_get_pagecolors(j);
2004 2001                          page_freelists[i][j] = (page_t **)addr;
2005 2002                          addr += (colors * sizeof (page_t *));
2006 2003                  }
2007 2004                  page_cachelists[i] = (page_t **)addr;
2008 2005                  addr += (page_colors * sizeof (page_t *));
2009 2006          }
2010 2007  }
2011 2008  
2012 2009  #if defined(__xpv)
2013 2010  /*
2014 2011   * Give back 10% of the io_pool pages to the free list.
2015 2012   * Don't shrink the pool below some absolute minimum.
2016 2013   */
2017 2014  static void
2018 2015  page_io_pool_shrink()
2019 2016  {
2020 2017          int retcnt;
2021 2018          page_t *pp, *pp_first, *pp_last, **curpool;
2022 2019          mfn_t mfn;
2023 2020          int bothpools = 0;
2024 2021  
2025 2022          mutex_enter(&io_pool_lock);
2026 2023          io_pool_shrink_attempts++;      /* should be a kstat? */
2027 2024          retcnt = io_pool_cnt / 10;
2028 2025          if (io_pool_cnt - retcnt < io_pool_cnt_min)
2029 2026                  retcnt = io_pool_cnt - io_pool_cnt_min;
2030 2027          if (retcnt <= 0)
2031 2028                  goto done;
2032 2029          io_pool_shrinks++;      /* should be a kstat? */
2033 2030          curpool = &io_pool_4g;
2034 2031  domore:
2035 2032          /*
2036 2033           * Loop through taking pages from the end of the list
2037 2034           * (highest mfns) till amount to return reached.
2038 2035           */
2039 2036          for (pp = *curpool; pp && retcnt > 0; ) {
2040 2037                  pp_first = pp_last = pp->p_prev;
2041 2038                  if (pp_first == *curpool)
2042 2039                          break;
2043 2040                  retcnt--;
2044 2041                  io_pool_cnt--;
2045 2042                  page_io_pool_sub(curpool, pp_first, pp_last);
2046 2043                  if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2047 2044                          start_mfn = mfn;
2048 2045                  page_free(pp_first, 1);
2049 2046                  pp = *curpool;
2050 2047          }
2051 2048          if (retcnt != 0 && !bothpools) {
2052 2049                  /*
2053 2050                   * If not enough found in less constrained pool try the
2054 2051                   * more constrained one.
2055 2052                   */
2056 2053                  curpool = &io_pool_16m;
2057 2054                  bothpools = 1;
2058 2055                  goto domore;
2059 2056          }
2060 2057  done:
2061 2058          mutex_exit(&io_pool_lock);
2062 2059  }
2063 2060  
2064 2061  #endif  /* __xpv */
2065 2062  
2066 2063  uint_t
2067 2064  page_create_update_flags_x86(uint_t flags)
2068 2065  {
2069 2066  #if defined(__xpv)
2070 2067          /*
2071 2068           * Check this is an urgent allocation and free pages are depleted.
2072 2069           */
2073 2070          if (!(flags & PG_WAIT) && freemem < desfree)
2074 2071                  page_io_pool_shrink();
2075 2072  #else /* !__xpv */
2076 2073          /*
2077 2074           * page_create_get_something may call this because 4g memory may be
2078 2075           * depleted. Set flags to allow for relocation of base page below
2079 2076           * 4g if necessary.
2080 2077           */
2081 2078          if (physmax4g)
2082 2079                  flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2083 2080  #endif /* __xpv */
2084 2081          return (flags);
2085 2082  }
2086 2083  
2087 2084  /*ARGSUSED*/
2088 2085  int
2089 2086  bp_color(struct buf *bp)
2090 2087  {
2091 2088          return (0);
2092 2089  }
2093 2090  
2094 2091  #if defined(__xpv)
2095 2092  
2096 2093  /*
2097 2094   * Take pages out of an io_pool
2098 2095   */
2099 2096  static void
2100 2097  page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2101 2098  {
2102 2099          if (*poolp == pp_first) {
2103 2100                  *poolp = pp_last->p_next;
2104 2101                  if (*poolp == pp_first)
2105 2102                          *poolp = NULL;
2106 2103          }
2107 2104          pp_first->p_prev->p_next = pp_last->p_next;
2108 2105          pp_last->p_next->p_prev = pp_first->p_prev;
2109 2106          pp_first->p_prev = pp_last;
2110 2107          pp_last->p_next = pp_first;
2111 2108  }
2112 2109  
2113 2110  /*
2114 2111   * Put a page on the io_pool list. The list is ordered by increasing MFN.
2115 2112   */
2116 2113  static void
2117 2114  page_io_pool_add(page_t **poolp, page_t *pp)
2118 2115  {
2119 2116          page_t  *look;
2120 2117          mfn_t   mfn = mfn_list[pp->p_pagenum];
2121 2118  
2122 2119          if (*poolp == NULL) {
2123 2120                  *poolp = pp;
2124 2121                  pp->p_next = pp;
2125 2122                  pp->p_prev = pp;
2126 2123                  return;
2127 2124          }
2128 2125  
2129 2126          /*
2130 2127           * Since we try to take pages from the high end of the pool
2131 2128           * chances are good that the pages to be put on the list will
2132 2129           * go at or near the end of the list, so start at the end and
2133 2130           * work backwards.
2134 2131           */
2135 2132          look = (*poolp)->p_prev;
2136 2133          while (mfn < mfn_list[look->p_pagenum]) {
2137 2134                  look = look->p_prev;
2138 2135                  if (look == (*poolp)->p_prev)
2139 2136                          break; /* backed all the way to front of list */
2140 2137          }
2141 2138  
2142 2139          /* insert after look */
2143 2140          pp->p_prev = look;
2144 2141          pp->p_next = look->p_next;
2145 2142          pp->p_next->p_prev = pp;
2146 2143          look->p_next = pp;
2147 2144          if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2148 2145                  /*
2149 2146                   * we inserted a new first list element
2150 2147                   * adjust pool pointer to newly inserted element
2151 2148                   */
2152 2149                  *poolp = pp;
2153 2150          }
2154 2151  }
2155 2152  
2156 2153  /*
2157 2154   * Add a page to the io_pool.  Setting the force flag will force the page
2158 2155   * into the io_pool no matter what.
            *
            * Pages whose MFN is below PFN_16MEG always go on io_pool_16m.  Other
            * pages go on io_pool_4g while the pool is below io_pool_cnt_max, when
            * force is set, or when io_pool_4g is empty.  Otherwise the pool keeps
            * whichever of pp and its current highest-MFN page has the lower MFN and
            * the other one is handed to page_free().
            *
            * io_pool_lock is taken and released here; the page_free() call is made
            * after the lock has been dropped.
2159 2156   */
2160 2157  static void
2161 2158  add_page_to_pool(page_t *pp, int force)
2162 2159  {
2163 2160          page_t *highest;
2164 2161          page_t *freep = NULL;   /* page to free once the lock is dropped */
2165 2162  
2166 2163          mutex_enter(&io_pool_lock);
2167 2164          /*
2168 2165           * Always keep the scarce low memory pages
2169 2166           */
2170 2167          if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2171 2168                  ++io_pool_cnt;
2172 2169                  page_io_pool_add(&io_pool_16m, pp);
2173 2170                  goto done;
2174 2171          }
2175 2172          if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2176 2173                  ++io_pool_cnt;
2177 2174                  page_io_pool_add(&io_pool_4g, pp);
2178 2175          } else {
                           /*
                            * Pool is full: swap pp for the highest-MFN page if pp
                            * is lower.  io_pool_cnt is unchanged either way.
                            */
2179 2176                  highest = io_pool_4g->p_prev;
2180 2177                  if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2181 2178                          page_io_pool_sub(&io_pool_4g, highest, highest);
2182 2179                          page_io_pool_add(&io_pool_4g, pp);
2183 2180                          freep = highest;
2184 2181                  } else {
2185 2182                          freep = pp;
2186 2183                  }
2187 2184          }
2188 2185  done:
2189 2186          mutex_exit(&io_pool_lock);
2190 2187          if (freep)
2191 2188                  page_free(freep, 1);
2192 2189  }
2193 2190  
2194 2191  
2195 2192  int contig_pfn_cnt;     /* number of pfns currently in the contig pfn list */
2196 2193  int contig_pfn_max;     /* capacity (in entries) of the contig pfn list */
2197 2194  int next_alloc_pfn;     /* next list index at which to start a contig search */
2198 2195  int contig_pfnlist_updates;     /* pfn list update count */
2199 2196  int contig_pfnlist_builds;      /* how many times have we (re)built list */
2200 2197  int contig_pfnlist_buildfailed; /* how many times has list build failed */
2201 2198  int create_contig_pending;      /* nonzero means taskq creating contig list */
2202 2199  pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2203 2200  
2204 2201  /*
2205 2202   * Function to use in sorting a list of pfns by their underlying mfns.
            *
            * Both arguments point at pfn_t values; each pfn is translated through
            * mfn_list[] and the resulting mfns are compared.  Returns 1, -1 or 0
            * when the first mfn is greater than, less than, or equal to the second.
2206 2203   */
2207 2204  static int
2208 2205  mfn_compare(const void *pfnp1, const void *pfnp2)
2209 2206  {
2210 2207          mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2211 2208          mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2212 2209  
                   /* explicit compares rather than a subtraction of the two mfns */
2213 2210          if (mfn1 > mfn2)
2214 2211                  return (1);
2215 2212          if (mfn1 < mfn2)
2216 2213                  return (-1);
2217 2214          return (0);
2218 2215  }
2219 2216  
2220 2217  /*
2221 2218   * Compact the contig_pfn_list by tossing all the non-contiguous
2222 2219   * elements from the list.
            *
            * Each adjacent pair of entries is examined; an entry survives only if
            * its mfn is exactly one less than the mfn of the entry after it, or one
            * more than the mfn of the entry before it.  Survivors are packed to the
            * front of the array in place, the vacated tail is zeroed, and
            * contig_pfn_cnt is set to the number of survivors (which may be 0).
            *
            * The only caller in this file invokes it right after sorting the list
            * by mfn, with contig_list_lock held.
2223 2220   */
2224 2221  static void
2225 2222  compact_contig_pfn_list(void)
2226 2223  {
2227 2224          pfn_t pfn, lapfn, prev_lapfn;
2228 2225          mfn_t mfn;
2229 2226          int i, newcnt = 0;
2230 2227  
                   /*
                    * NOTE(review): 0 doubles as "no previous lookahead", so a list
                    * whose first kept entry is pfn 0 would not have that entry
                    * copied -- confirm pfn 0 can never be on the list.
                    */
2231 2228          prev_lapfn = 0;
2232 2229          for (i = 0; i < contig_pfn_cnt - 1; i++) {
2233 2230                  pfn = contig_pfn_list[i];
2234 2231                  lapfn = contig_pfn_list[i + 1];
2235 2232                  mfn = mfn_list[pfn];
2236 2233                  /*
2237 2234                   * See if next pfn is for a contig mfn
2238 2235                   */
2239 2236                  if (mfn_list[lapfn] != mfn + 1)
2240 2237                          continue;
2241 2238                  /*
2242 2239                   * pfn and lookahead are both put in list
2243 2240                   * unless pfn is the previous lookahead.
2244 2241                   */
2245 2242                  if (pfn != prev_lapfn)
2246 2243                          contig_pfn_list[newcnt++] = pfn;
2247 2244                  contig_pfn_list[newcnt++] = lapfn;
2248 2245                  prev_lapfn = lapfn;
2249 2246          }
                   /* clear the entries that are no longer part of the list */
2250 2247          for (i = newcnt; i < contig_pfn_cnt; i++)
2251 2248                  contig_pfn_list[i] = 0;
2252 2249          contig_pfn_cnt = newcnt;
2253 2250  }
2254 2251  
           /*
            * Taskq callback dispatched by create_contig_pfnlist() when it could not
            * allocate the list without sleeping.  Retries the build with PG_WAIT so
            * the allocation may sleep.  arg is unused and the result is ignored.
            */
2255 2252  /*ARGSUSED*/
2256 2253  static void
2257 2254  call_create_contiglist(void *arg)
2258 2255  {
2259 2256          (void) create_contig_pfnlist(PG_WAIT);
2260 2257  }
2261 2258  
2262 2259  /*
2263 2260   * Create list of freelist pfns that have underlying
2264 2261   * contiguous mfns.  The list is kept in ascending mfn order.
2265 2262   * returns 1 if list created else 0.
            *
            * flags - only PG_WAIT is examined; it selects a sleeping (KM_SLEEP)
            *         rather than a non-sleeping (KM_NOSLEEP) allocation.
            *
            * 1 is also returned when a list already exists.  0 is returned when the
            * array could not be allocated (a taskq job is then dispatched to retry
            * with PG_WAIT) or when fewer than two free pfns were found.
            *
            * contig_list_lock is taken and released here.
2266 2263   */
2267 2264  static int
2268 2265  create_contig_pfnlist(uint_t flags)
2269 2266  {
2270 2267          pfn_t pfn;
2271 2268          page_t *pp;
2272 2269          int ret = 1;
2273 2270  
2274 2271          mutex_enter(&contig_list_lock);
2275 2272          if (contig_pfn_list != NULL)
2276 2273                  goto out;
                   /* size the array for the current free page count plus 10% */
2277 2274          contig_pfn_max = freemem + (freemem / 10);
2278 2275          contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2279 2276              (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2280 2277          if (contig_pfn_list == NULL) {
2281 2278                  /*
2282 2279                   * If we could not create the contig list (because
2283 2280                   * we could not sleep for memory).  Dispatch a taskq that can
2284 2281                   * sleep to get the memory.
2285 2282                   */
2286 2283                  if (!create_contig_pending) {
2287 2284                          if (taskq_dispatch(system_taskq, call_create_contiglist,
2288 2285                              NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2289 2286                                  create_contig_pending = 1;
2290 2287                  }
2291 2288                  contig_pfnlist_buildfailed++;   /* count list build failures */
2292 2289                  ret = 0;
2293 2290                  goto out;
2294 2291          }
2295 2292          create_contig_pending = 0;
2296 2293          ASSERT(contig_pfn_cnt == 0);
                   /*
                    * Collect every pfn below mfn_count that has a page_t marked
                    * free, stopping early if the array fills up.  Pages are only
                    * inspected here (page_numtopp_nolock), not allocated.
                    */
2297 2294          for (pfn = 0; pfn < mfn_count; pfn++) {
2298 2295                  pp = page_numtopp_nolock(pfn);
2299 2296                  if (pp == NULL || !PP_ISFREE(pp))
2300 2297                          continue;
2301 2298                  contig_pfn_list[contig_pfn_cnt] = pfn;
2302 2299                  if (++contig_pfn_cnt == contig_pfn_max)
2303 2300                          break;
2304 2301          }
2305 2302          /*
2306 2303           * Sanity check the new list.
2307 2304           */
2308 2305          if (contig_pfn_cnt < 2) { /* no contig pfns */
2309 2306                  contig_pfn_cnt = 0;
2310 2307                  contig_pfnlist_buildfailed++;
2311 2308                  kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2312 2309                  contig_pfn_list = NULL;
2313 2310                  contig_pfn_max = 0;
2314 2311                  ret = 0;
2315 2312                  goto out;
2316 2313          }
2317 2314          qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
                   /*
                    * NOTE(review): compaction can reduce contig_pfn_cnt to 0 when
                    * no two free pfns have adjacent mfns; the list then stays
                    * allocated but empty and 1 is still returned.
                    */
2318 2315          compact_contig_pfn_list();
2319 2316          /*
2320 2317           * Make sure next search of the newly created contiguous pfn
2321 2318           * list starts at the beginning of the list.
2322 2319           */
2323 2320          next_alloc_pfn = 0;
2324 2321          contig_pfnlist_builds++;        /* count list builds */
2325 2322  out:
2326 2323          mutex_exit(&contig_list_lock);
2327 2324          return (ret);
2328 2325  }
2329 2326  
2330 2327  
2331 2328  /*
2332 2329   * Toss the current contig pfnlist.  Someone is about to do a massive
2333 2330   * update to pfn<->mfn mappings.  So we have them destroy the list and lock
2334 2331   * it till they are done with their update.
            *
            * Returns with contig_list_lock held; the caller releases it with
            * unlock_contig_pfnlist().
2335 2332   */
2336 2333  void
2337 2334  clear_and_lock_contig_pfnlist()
2338 2335  {
2339 2336          pfn_t *listp = NULL;
2340 2337          size_t listsize;        /* only meaningful when listp != NULL */
2341 2338  
2342 2339          mutex_enter(&contig_list_lock);
2343 2340          if (contig_pfn_list != NULL) {
                           /* remember the size before the bookkeeping is reset */
2344 2341                  listp = contig_pfn_list;
2345 2342                  listsize = contig_pfn_max * sizeof (pfn_t);
2346 2343                  contig_pfn_list = NULL;
2347 2344                  contig_pfn_max = contig_pfn_cnt = 0;
2348 2345          }
2349 2346          if (listp != NULL)
2350 2347                  kmem_free(listp, listsize);
                   /* contig_list_lock is intentionally still held on return */
2351 2348  }
2352 2349  
2353 2350  /*
2354 2351   * Unlock the contig_pfn_list.  The next attempted use of it will cause
2355 2352   * it to be re-created.
            *
            * Releases contig_list_lock, which clear_and_lock_contig_pfnlist() leaves
            * held on return.
2356 2353   */
2357 2354  void
2358 2355  unlock_contig_pfnlist()
2359 2356  {
2360 2357          mutex_exit(&contig_list_lock);
2361 2358  }
2362 2359  
2363 2360  /*
2364 2361   * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
            *
            * pfn    - the pfn whose backing mfn changed
            * oldmfn - mfn that used to back pfn; used as the key when searching
            *          the list for pfn's current position
            * newmfn - mfn that now backs pfn, or MFN_INVALID if there is none
            *
            * The pfn is first removed from the list if it is found.  If that leaves
            * fewer than two entries the whole list is freed.  Then, unless newmfn is
            * MFN_INVALID, the pfn is re-inserted next to an entry whose mfn is
            * newmfn - 1 or newmfn + 1, provided such an entry is found and the array
            * has room.
            *
            * contig_list_lock is acquired here unless the calling thread already
            * owns it, and is released on return only if it was acquired here.
            * Nothing is done when no list exists.
2365 2362   */
2366 2363  void
2367 2364  update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2368 2365  {
2369 2366          int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2370 2367          pfn_t probe_pfn;
2371 2368          mfn_t probe_mfn;
2372 2369          int drop_lock = 0;
2373 2370  
2374 2371          if (mutex_owner(&contig_list_lock) != curthread) {
2375 2372                  drop_lock = 1;
2376 2373                  mutex_enter(&contig_list_lock);
2377 2374          }
2378 2375          if (contig_pfn_list == NULL)
2379 2376                  goto done;
2380 2377          contig_pfnlist_updates++;
2381 2378          /*
2382 2379           * Find the pfn in the current list.  Use a binary chop to locate it.
                    *
                    * NOTE(review): the midpoint rounds down and the search stops
                    * as soon as probe_pos == probe_lo, so index probe_hi itself is
                    * never examined; a pfn stored in the last slot looks like it
                    * would be reported as "not in list" -- confirm.
2383 2380           */
2384 2381          probe_hi = contig_pfn_cnt - 1;
2385 2382          probe_lo = 0;
2386 2383          probe_pos = (probe_hi + probe_lo) / 2;
2387 2384          while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2388 2385                  if (probe_pos == probe_lo) { /* pfn not in list */
2389 2386                          probe_pos = -1;
2390 2387                          break;
2391 2388                  }
2392 2389                  if (pfn_to_mfn(probe_pfn) <= oldmfn)
2393 2390                          probe_lo = probe_pos;
2394 2391                  else
2395 2392                          probe_hi = probe_pos;
2396 2393                  probe_pos = (probe_hi + probe_lo) / 2;
2397 2394          }
2398 2395          if (probe_pos >= 0) {
2399 2396                  /*
2400 2397                   * Remove pfn from list and ensure next alloc
2401 2398                   * position stays in bounds.
2402 2399                   */
2403 2400                  if (--contig_pfn_cnt <= next_alloc_pfn)
2404 2401                          next_alloc_pfn = 0;
2405 2402                  if (contig_pfn_cnt < 2) { /* no contig pfns */
2406 2403                          contig_pfn_cnt = 0;
2407 2404                          kmem_free(contig_pfn_list,
2408 2405                              contig_pfn_max * sizeof (pfn_t));
2409 2406                          contig_pfn_list = NULL;
2410 2407                          contig_pfn_max = 0;
2411 2408                          goto done;
2412 2409                  }
                           /*
                            * Close the gap: slide the entries above probe_pos down
                            * by one.  contig_pfn_cnt was already decremented, so
                            * (contig_pfn_cnt - probe_pos) is the number of entries
                            * that followed the removed one.
                            */
2413 2410                  ovbcopy(&contig_pfn_list[probe_pos + 1],
2414 2411                      &contig_pfn_list[probe_pos],
2415 2412                      (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2416 2413          }
2417 2414          if (newmfn == MFN_INVALID)
2418 2415                  goto done;
2419 2416          /*
2420 2417           * Check if new mfn has adjacent mfns in the list
                    *
                    * insert_after is the index the pfn should follow.  It can
                    * legitimately be -1 (insert at the front), which is why -2 is
                    * used as the "no neighbour found" sentinel.
2421 2418           */
2422 2419          probe_hi = contig_pfn_cnt - 1;
2423 2420          probe_lo = 0;
2424 2421          insert_after = -2;
2425 2422          do {
2426 2423                  probe_pos = (probe_hi + probe_lo) / 2;
2427 2424                  probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2428 2425                  if (newmfn == probe_mfn + 1)
2429 2426                          insert_after = probe_pos;
2430 2427                  else if (newmfn == probe_mfn - 1)
2431 2428                          insert_after = probe_pos - 1;
2432 2429                  if (probe_pos == probe_lo)
2433 2430                          break;
2434 2431                  if (probe_mfn <= newmfn)
2435 2432                          probe_lo = probe_pos;
2436 2433                  else
2437 2434                          probe_hi = probe_pos;
2438 2435          } while (insert_after == -2);
2439 2436          /*
2440 2437           * If there is space in the list and there are adjacent mfns
2441 2438           * insert the pfn in to its proper place in the list.
2442 2439           */
2443 2440          if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2444 2441                  insert_point = insert_after + 1;
                           /* open a slot: slide the tail of the list up by one */
2445 2442                  ovbcopy(&contig_pfn_list[insert_point],
2446 2443                      &contig_pfn_list[insert_point + 1],
2447 2444                      (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2448 2445                  contig_pfn_list[insert_point] = pfn;
2449 2446                  contig_pfn_cnt++;
2450 2447          }
2451 2448  done:
2452 2449          if (drop_lock)
2453 2450                  mutex_exit(&contig_list_lock);
2454 2451  }
2455 2452  
2456 2453  /*
2457 2454   * Called to (re-)populate the io_pool from the free page lists.
            *
            * Walks mfns upward from start_mfn (not a local here, so the position
            * carries over between calls) to the lower of cached_max_mfn and
            * PFN_4GIG, allocating each free page found and forcing it into the
            * pool, until io_pool_cnt reaches io_pool_cnt_max or the range is
            * exhausted.
            *
            * Returns the resulting io_pool_cnt.
2458 2455   */
2459 2456  long
2460 2457  populate_io_pool(void)
2461 2458  {
2462 2459          pfn_t pfn;
2463 2460          mfn_t mfn, max_mfn;
2464 2461          page_t *pp;
2465 2462  
2466 2463          /*
2467 2464           * Figure out the bounds of the pool on first invocation.
2468 2465           * We use a percentage of memory for the io pool size.
2469 2466           * we allow that to shrink, but not to less than a fixed minimum
2470 2467           */
2471 2468          if (io_pool_cnt_max == 0) {
                           /*
                            * NOTE(review): integer division; this assumes
                            * io_pool_physmem_pct is between 1 and 100, otherwise
                            * the inner quotient is 0 -- confirm the tunable is
                            * range checked elsewhere.
                            */
2472 2469                  io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2473 2470                  io_pool_cnt_lowater = io_pool_cnt_max;
2474 2471                  /*
2475 2472                   * This is the first time in populate_io_pool, grab a va to use
2476 2473                   * when we need to allocate pages.
2477 2474                   */
2478 2475                  io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2479 2476          }
2480 2477          /*
2481 2478           * If we are out of pages in the pool, then grow the size of the pool
2482 2479           */
2483 2480          if (io_pool_cnt == 0) {
2484 2481                  /*
2485 2482                   * Grow the max size of the io pool by 5%, but never more than
2486 2483                   * 25% of physical memory.
2487 2484                   */
2488 2485                  if (io_pool_cnt_max < physmem / 4)
2489 2486                          io_pool_cnt_max += io_pool_cnt_max / 20;
2490 2487          }
                   /* incremented on every call, whether or not the limit grew */
2491 2488          io_pool_grows++;        /* should be a kstat? */
2492 2489  
2493 2490          /*
2494 2491           * Get highest mfn on this platform, but limit to the 32 bit DMA max.
                    *
                    * NOTE(review): the mfn_to_pfn() result is discarded;
                    * presumably the call is made for a side effect such as
                    * refreshing cached_max_mfn -- confirm.
2495 2492           */
2496 2493          (void) mfn_to_pfn(start_mfn);
2497 2494          max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2498 2495          for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2499 2496                  pfn = mfn_to_pfn(mfn);
2500 2497                  if (pfn & PFN_IS_FOREIGN_MFN)
2501 2498                          continue;
2502 2499                  /*
2503 2500                   * try to allocate it from free pages
2504 2501                   */
2505 2502                  pp = page_numtopp_alloc(pfn);
2506 2503                  if (pp == NULL)
2507 2504                          continue;
2508 2505                  PP_CLRFREE(pp);
                           /* force == 1: always goes into the pool */
2509 2506                  add_page_to_pool(pp, 1);
2510 2507                  if (io_pool_cnt >= io_pool_cnt_max)
2511 2508                          break;
2512 2509          }
2513 2510  
2514 2511          return (io_pool_cnt);
2515 2512  }
2516 2513  
2517 2514  /*
2518 2515   * Destroy a page that was being used for DMA I/O. It may or
2519 2516   * may not actually go back to the io_pool.
            *
            * The page's one-page reservation is released, its translations are
            * unloaded and it is hashed out.  It is then freed outright when this is
            * not the initial domain or when its mfn is at or above PFN_4GIG;
            * otherwise it is offered to the io_pool without the force flag, so
            * add_page_to_pool() may still end up freeing a page.
2520 2517   */
2521 2518  void
2522 2519  page_destroy_io(page_t *pp)
2523 2520  {
2524 2521          mfn_t mfn = mfn_list[pp->p_pagenum];
2525 2522  
2526 2523          /*
2527 2524           * When the page was alloc'd a reservation was made, release it now
2528 2525           */
2529 2526          page_unresv(1);
2530 2527          /*
2531 2528           * Unload translations, if any, then hash out the
2532 2529           * page to erase its identity.
2533 2530           */
2534 2531          (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2535 2532          page_hashout(pp, NULL);
2536 2533  
2537 2534          /*
2538 2535           * If the page came from the free lists, just put it back to them.
2539 2536           * DomU pages always go on the free lists as well.
2540 2537           */
2541 2538          if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2542 2539                  page_free(pp, 1);
2543 2540                  return;
2544 2541          }
2545 2542  
2546 2543          add_page_to_pool(pp, 0);
2547 2544  }
2548 2545  
2549 2546  
2550 2547  long contig_searches;           /* count of times contig pages requested */
2551 2548  long contig_search_restarts;    /* count of partial runs abandoned mid-search */
2552 2549  long contig_search_failed;      /* count of contig alloc failures */
2553 2550  
2554 2551  /*
2555 2552   * Free partial page list
            *
            * Repeatedly unlinks the head page of *pplist with page_io_pool_sub() and
            * hands it to page_free(), until the list is empty.  The loop relies on
            * page_io_pool_sub() setting *pplist to NULL once the last page has been
            * removed.
2556 2553   */
2557 2554  static void
2558 2555  free_partial_list(page_t **pplist)
2559 2556  {
2560 2557          page_t *pp;
2561 2558  
2562 2559          while (*pplist != NULL) {
2563 2560                  pp = *pplist;
2564 2561                  page_io_pool_sub(pplist, pp, pp);
2565 2562                  page_free(pp, 1);
2566 2563          }
2567 2564  }
2568 2565  
2569 2566  /*
2570 2567   * Look thru the contiguous pfns that are not part of the io_pool for
2571 2568   * contiguous free pages.  Return a list of the found pages or NULL.
            *
            * npages   - number of pages with consecutive mfns wanted
            * flags    - passed to create_contig_pfnlist() when the list has to be
            *            built (PG_WAIT allows that allocation to sleep)
            * pfnseg   - segment mask, in pages; a run is rejected once
            *            (mfn & pfnseg) drops below (start_mfn & pfnseg)
            * pfnalign - required alignment of the first mfn, in pages (0 = none)
            *
            * The scan resumes at next_alloc_pfn, wraps at the end of the list and
            * gives up after one full pass.  Pages are allocated with
            * page_numtopp_alloc() as the run is built and are all freed again if
            * the run cannot be completed.  contig_list_lock is held for the
            * duration of the scan.
2572 2569   */
2573 2570  page_t *
2574 2571  find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2575 2572      pgcnt_t pfnalign)
2576 2573  {
2577 2574          page_t *pp, *plist = NULL;
2578 2575          mfn_t mfn, prev_mfn, start_mfn;
2579 2576          pfn_t pfn;
2580 2577          int pages_needed, pages_requested;
2581 2578          int search_start;
2582 2579  
2583 2580          /*
2584 2581           * create the contig pfn list if not already done
2585 2582           */
2586 2583  retry:
2587 2584          mutex_enter(&contig_list_lock);
2588 2585          if (contig_pfn_list == NULL) {
2589 2586                  mutex_exit(&contig_list_lock);
2590 2587                  if (!create_contig_pfnlist(flags)) {
2591 2588                          return (NULL);
2592 2589                  }
2593 2590                  goto retry;
2594 2591          }
2595 2592          contig_searches++;
                   /*
                    * The list can exist but be empty (compaction found no adjacent
                    * mfns).  With a count of 0 the wrap test below can never match
                    * and the scan would run off the end of the array, so treat an
                    * empty list as a failed search.
                    */
                   if (contig_pfn_cnt == 0) {
                           mutex_exit(&contig_list_lock);
                           contig_search_failed++;
                           return (NULL);
                   }
2596 2593          /*
2597 2594           * Search contiguous pfn list for physically contiguous pages not in
2598 2595           * the io_pool.  Start the search where the last search left off.
2599 2596           */
2600 2597          pages_requested = pages_needed = npages;
2601 2598          search_start = next_alloc_pfn;
2602 2599          start_mfn = prev_mfn = 0;
2603 2600          while (pages_needed) {
2604 2601                  pfn = contig_pfn_list[next_alloc_pfn];
2605 2602                  mfn = pfn_to_mfn(pfn);
2606 2603                  /*
2607 2604                   * Check if mfn is first one or contig to previous one and
2608 2605                   * if page corresponding to mfn is free and that mfn
2609 2606                   * range is not crossing a segment boundary.
                            *
                            * The segment test is made before page_numtopp_alloc()
                            * because the allocation is the only clause with a side
                            * effect: a page that was allocated and then rejected
                            * by a later clause would be neither on plist nor freed.
2610 2607                   */
2611 2608                  if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2612 2609                      !((mfn & pfnseg) < (start_mfn & pfnseg)) &&
2613 2610                      (pp = page_numtopp_alloc(pfn)) != NULL) {
2614 2611                          PP_CLRFREE(pp);
2615 2612                          page_io_pool_add(&plist, pp);
2616 2613                          pages_needed--;
2617 2614                          if (prev_mfn == 0) {
2618 2615                                  if (pfnalign &&
2619 2616                                      mfn != P2ROUNDUP(mfn, pfnalign)) {
2620 2617                                          /*
2621 2618                                           * not properly aligned
2622 2619                                           */
2623 2620                                          contig_search_restarts++;
2624 2621                                          free_partial_list(&plist);
2625 2622                                          pages_needed = pages_requested;
2626 2623                                          start_mfn = prev_mfn = 0;
2627 2624                                          goto skip;
2628 2625                                  }
2629 2626                                  start_mfn = mfn;
2630 2627                          }
2631 2628                          prev_mfn = mfn;
2632 2629                  } else {
                                   /* run broken: give back what we had and start over */
2633 2630                          contig_search_restarts++;
2634 2631                          free_partial_list(&plist);
2635 2632                          pages_needed = pages_requested;
2636 2633                          start_mfn = prev_mfn = 0;
2637 2634                  }
2638 2635  skip:
2639 2636                  if (++next_alloc_pfn == contig_pfn_cnt)
2640 2637                          next_alloc_pfn = 0;
2641 2638                  if (next_alloc_pfn == search_start)
2642 2639                          break; /* all pfns searched */
2643 2640          }
2644 2641          mutex_exit(&contig_list_lock);
2645 2642          if (pages_needed) {
2646 2643                  contig_search_failed++;
2647 2644                  /*
2648 2645                   * Failed to find enough contig pages.
2649 2646                   * free partial page list
2650 2647                   */
2651 2648                  free_partial_list(&plist);
2652 2649          }
2653 2650          return (plist);
2654 2651  }
2655 2652  
2656 2653  /*
2657 2654   * Search the reserved io pool pages for a page range with the
2658 2655   * desired characteristics.
            *
            * mattr  - DMA attributes; the address range (dma_attr_addr_lo/hi),
            *          segment (dma_attr_seg) and alignment (the larger of
            *          dma_attr_align and dma_attr_minxfer) are honoured
            * contig - nonzero if the pages must have consecutive mfns; ignored
            *          when minctg is 1
            * minctg - number of pages wanted
            *
            * On success the pages are unlinked from the pool as their own circular
            * list and the list is returned; io_pool_cnt and the low water mark are
            * adjusted.  If the pool cannot satisfy the request, the more constrained
            * pool is tried, then kmem_reap() and populate_io_pool() are used to
            * refill and the search is repeated; NULL is returned after the fourth
            * failed attempt.
            *
            * io_pool_lock is taken for each search pass and is not held on return.
2659 2656   */
2660 2657  page_t *
2661 2658  page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2662 2659  {
2663 2660          page_t *pp_first, *pp_last;
2664 2661          page_t *pp, **poolp;
2665 2662          pgcnt_t nwanted, pfnalign;
2666 2663          uint64_t pfnseg;
2667 2664          mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2668 2665          int align, attempt = 0;
2669 2666  
2670 2667          if (minctg == 1)
2671 2668                  contig = 0;
                   /* convert the byte-based DMA limits into page (mfn) units */
2672 2669          lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2673 2670          hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2674 2671          pfnseg = mmu_btop(mattr->dma_attr_seg);
2675 2672          align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2676 2673          if (align > MMU_PAGESIZE)
2677 2674                  pfnalign = mmu_btop(align);
2678 2675          else
2679 2676                  pfnalign = 0;
2680 2677  
2681 2678  try_again:
2682 2679          /*
2683 2680           * See if we want pages for a legacy device
2684 2681           */
2685 2682          if (hi_mfn < PFN_16MEG)
2686 2683                  poolp = &io_pool_16m;
2687 2684          else
2688 2685                  poolp = &io_pool_4g;
2689 2686  try_smaller:
2690 2687          /*
2691 2688           * Take pages from I/O pool. We'll use pages from the highest
2692 2689           * MFN range possible.
                    *
                    * The walk starts at the tail of the ring ((*poolp)->p_prev) and
                    * moves toward lower mfns.  pp_last is the first (highest) page
                    * accepted and pp_first the most recent (lowest) one.
2693 2690           */
2694 2691          pp_first = pp_last = NULL;
2695 2692          mutex_enter(&io_pool_lock);
2696 2693          nwanted = minctg;
2697 2694          for (pp = *poolp; pp && nwanted > 0; ) {
2698 2695                  pp = pp->p_prev;
2699 2696  
2700 2697                  /*
2701 2698                   * skip pages above allowable range
2702 2699                   */
2703 2700                  mfn = mfn_list[pp->p_pagenum];
2704 2701                  if (hi_mfn < mfn)
2705 2702                          goto skip;
2706 2703  
2707 2704                  /*
2708 2705                   * stop at pages below allowable range
2709 2706                   */
2710 2707                  if (lo_mfn > mfn)
2711 2708                          break;
2712 2709  restart:
2713 2710                  if (pp_last == NULL) {
2714 2711                          /*
2715 2712                           * Check alignment
                                    *
                                    * tmfn is the mfn the run would start at if it
                                    * ended at this page and were minctg pages long.
2716 2713                           */
2717 2714                          tmfn = mfn - (minctg - 1);
2718 2715                          if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2719 2716                                  goto skip; /* not properly aligned */
2720 2717                          /*
2721 2718                           * Check segment
2722 2719                           */
2723 2720                          if ((mfn & pfnseg) < (tmfn & pfnseg))
2724 2721                                  goto skip; /* crosses seg boundary */
2725 2722                          /*
2726 2723                           * Start building page list
2727 2724                           */
2728 2725                          pp_first = pp_last = pp;
2729 2726                          nwanted--;
2730 2727                  } else {
2731 2728                          /*
2732 2729                           * check physical contiguity if required
2733 2730                           */
2734 2731                          if (contig &&
2735 2732                              mfn_list[pp_first->p_pagenum] != mfn + 1) {
2736 2733                                  /*
2737 2734                                   * not a contiguous page, restart list.
                                            *
                                            * The current page is re-examined as
                                            * the possible end of a new run.
2738 2735                                   */
2739 2736                                  pp_last = NULL;
2740 2737                                  nwanted = minctg;
2741 2738                                  goto restart;
2742 2739                          } else { /* add page to list */
2743 2740                                  pp_first = pp;
2744 2741                                  nwanted--;
2745 2742                          }
2746 2743                  }
2747 2744  skip:
                           /* the head of the ring is the last page to look at */
2748 2745                  if (pp == *poolp)
2749 2746                          break;
2750 2747          }
2751 2748  
2752 2749          /*
2753 2750           * If we didn't find memory. Try the more constrained pool, then
2754 2751           * sweep free pages into the DMA pool and try again.
2755 2752           */
2756 2753          if (nwanted != 0) {
2757 2754                  mutex_exit(&io_pool_lock);
2758 2755                  /*
2759 2756                   * If we were looking in the less constrained pool and
2760 2757                   * didn't find pages, try the more constrained pool.
2761 2758                   */
2762 2759                  if (poolp == &io_pool_4g) {
2763 2760                          poolp = &io_pool_16m;
2764 2761                          goto try_smaller;
2765 2762                  }
2766 2763                  kmem_reap();
2767 2764                  if (++attempt < 4) {
2768 2765                          /*
2769 2766                           * Grab some more io_pool pages
2770 2767                           */
2771 2768                          (void) populate_io_pool();
2772 2769                          goto try_again; /* go around and retry */
2773 2770                  }
2774 2771                  return (NULL);
2775 2772          }
2776 2773          /*
2777 2774           * Found the pages, now snip them from the list
2778 2775           */
2779 2776          page_io_pool_sub(poolp, pp_first, pp_last);
2780 2777          io_pool_cnt -= minctg;
2781 2778          /*
2782 2779           * reset low water mark
2783 2780           */
2784 2781          if (io_pool_cnt < io_pool_cnt_lowater)
2785 2782                  io_pool_cnt_lowater = io_pool_cnt;
2786 2783          mutex_exit(&io_pool_lock);
2787 2784          return (pp_first);
2788 2785  }
2789 2786  
2790 2787  page_t *
2791 2788  page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2792 2789      ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2793 2790  {
2794 2791          uint_t kflags;
2795 2792          int order, extra, extpages, i, contig, nbits, extents;
2796 2793          page_t *pp, *expp, *pp_first, **pplist = NULL;
2797 2794          mfn_t *mfnlist = NULL;
2798 2795  
2799 2796          contig = flags & PG_PHYSCONTIG;
2800 2797          if (minctg == 1)
2801 2798                  contig = 0;
2802 2799          flags &= ~PG_PHYSCONTIG;
2803 2800          kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2804 2801          /*
2805 2802           * Hypervisor will allocate extents, if we want contig
2806 2803           * pages extent must be >= minctg
2807 2804           */
2808 2805          if (contig) {
2809 2806                  order = highbit(minctg) - 1;
2810 2807                  if (minctg & ((1 << order) - 1))
2811 2808                          order++;
2812 2809                  extpages = 1 << order;
2813 2810          } else {
2814 2811                  order = 0;
2815 2812                  extpages = minctg;
2816 2813          }
2817 2814          if (extpages > minctg) {
2818 2815                  extra = extpages - minctg;
2819 2816                  if (!page_resv(extra, kflags))
2820 2817                          return (NULL);
2821 2818          }
2822 2819          pp_first = NULL;
2823 2820          pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2824 2821          if (pplist == NULL)
2825 2822                  goto balloon_fail;
2826 2823          mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2827 2824          if (mfnlist == NULL)
2828 2825                  goto balloon_fail;
2829 2826          pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2830 2827          if (pp == NULL)
2831 2828                  goto balloon_fail;
2832 2829          pp_first = pp;
2833 2830          if (extpages > minctg) {
2834 2831                  /*
2835 2832                   * fill out the rest of extent pages to swap
2836 2833                   * with the hypervisor
2837 2834                   */
2838 2835                  for (i = 0; i < extra; i++) {
2839 2836                          expp = page_create_va(vp,
2840 2837                              (u_offset_t)(uintptr_t)io_pool_kva,
2841 2838                              PAGESIZE, flags, &kvseg, io_pool_kva);
2842 2839                          if (expp == NULL)
2843 2840                                  goto balloon_fail;
2844 2841                          (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2845 2842                          page_io_unlock(expp);
2846 2843                          page_hashout(expp, NULL);
2847 2844                          page_io_lock(expp);
2848 2845                          /*
2849 2846                           * add page to end of list
2850 2847                           */
2851 2848                          expp->p_prev = pp_first->p_prev;
2852 2849                          expp->p_next = pp_first;
2853 2850                          expp->p_prev->p_next = expp;
2854 2851                          pp_first->p_prev = expp;
2855 2852                  }
2856 2853  
2857 2854          }
2858 2855          for (i = 0; i < extpages; i++) {
2859 2856                  pplist[i] = pp;
2860 2857                  pp = pp->p_next;
2861 2858          }
2862 2859          nbits = highbit(mattr->dma_attr_addr_hi);
2863 2860          extents = contig ? 1 : minctg;
2864 2861          if (balloon_replace_pages(extents, pplist, nbits, order,
2865 2862              mfnlist) != extents) {
2866 2863                  if (ioalloc_dbg)
2867 2864                          cmn_err(CE_NOTE, "request to hypervisor"
2868 2865                              " for %d pages, maxaddr %" PRIx64 " failed",
2869 2866                              extpages, mattr->dma_attr_addr_hi);
2870 2867                  goto balloon_fail;
2871 2868          }
2872 2869  
2873 2870          kmem_free(pplist, extpages * sizeof (page_t *));
2874 2871          kmem_free(mfnlist, extpages * sizeof (mfn_t));
2875 2872          /*
2876 2873           * Return any excess pages to free list
2877 2874           */
2878 2875          if (extpages > minctg) {
2879 2876                  for (i = 0; i < extra; i++) {
2880 2877                          pp = pp_first->p_prev;
2881 2878                          page_sub(&pp_first, pp);
2882 2879                          page_io_unlock(pp);
2883 2880                          page_unresv(1);
2884 2881                          page_free(pp, 1);
2885 2882                  }
2886 2883          }
2887 2884          return (pp_first);
2888 2885  balloon_fail:
2889 2886          /*
2890 2887           * Return pages to free list and return failure
2891 2888           */
2892 2889          while (pp_first != NULL) {
2893 2890                  pp = pp_first;
2894 2891                  page_sub(&pp_first, pp);
2895 2892                  page_io_unlock(pp);
2896 2893                  if (pp->p_vnode != NULL)
2897 2894                          page_hashout(pp, NULL);
2898 2895                  page_free(pp, 1);
2899 2896          }
2900 2897          if (pplist)
2901 2898                  kmem_free(pplist, extpages * sizeof (page_t *));
2902 2899          if (mfnlist)
2903 2900                  kmem_free(mfnlist, extpages * sizeof (mfn_t));
2904 2901          page_unresv(extpages - minctg);
2905 2902          return (NULL);
2906 2903  }
2907 2904  
           /*
            * Give back every page on a partially built allocation list.
            *
            * Each page is taken off plist with page_sub(), has its I/O lock
            * dropped, and is then handed to page_destroy_io().  Nothing happens
            * when plist is NULL.  Used by the failure paths of
            * page_get_contigpages() and page_create_io().
            */
2908 2905  static void
2909 2906  return_partial_alloc(page_t *plist)
2910 2907  {
2911 2908          page_t *pp;
2912 2909  
                   /* page_sub() updates plist; the loop ends once the list is empty */
2913 2910          while (plist != NULL) {
2914 2911                  pp = plist;
2915 2912                  page_sub(&plist, pp);
2916 2913                  page_io_unlock(pp);
2917 2914                  page_destroy_io(pp);
2918 2915          }
2919 2916  }
2920 2917  
           /*
            * Xen (__xpv) helper for page_create_io(): gather pages that satisfy the
            * DMA attributes in mattr, in runs of contiguous pages.
            *
            * vp, off  - identity given to the pages; off advances by one page for
            *            every page collected in the main loop.
            * npagesp  - in: number of pages wanted, or -1 to ask for a single page
            *            with the alignment requirement dropped.  out: number of
            *            pages still outstanding (0 when the page_create_va()
            *            shortcut satisfied the whole request).
            * flags    - page creation flags; PG_PHYSCONTIG requests contiguity.
            * vaddr    - address passed through to page_create_va() and
            *            page_swap_with_hypervisor().
            * mattr    - DMA attributes; must not be NULL (asserted).
            *
            * Each run is sought, in order, from find_contig_free() (only when any
            * address is acceptable and contiguity is wanted), from
            * page_io_pool_alloc(), and finally from page_swap_with_hypervisor().
            * On failure any pages already collected are released with
            * return_partial_alloc() and NULL is returned.
            */
2921 2918  static page_t *
2922 2919  page_get_contigpages(
2923 2920          struct vnode    *vp,
2924 2921          u_offset_t      off,
2925 2922          int             *npagesp,
2926 2923          uint_t          flags,
2927 2924          caddr_t         vaddr,
2928 2925          ddi_dma_attr_t  *mattr)
2929 2926  {
2930 2927          mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2931 2928          page_t  *plist; /* list to return */
2932 2929          page_t  *pp, *mcpl;
2933 2930          int     contig, anyaddr, npages, getone = 0;
2934 2931          mfn_t   lo_mfn;
2935 2932          mfn_t   hi_mfn;
2936 2933          pgcnt_t pfnalign = 0;
2937 2934          int     align, sgllen;
2938 2935          uint64_t pfnseg;
2939 2936          pgcnt_t minctg;
2940 2937  
2941 2938          npages = *npagesp;
2942 2939          ASSERT(mattr != NULL);
                   /* convert the byte-based DMA limits into page frame units */
2943 2940          lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2944 2941          hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2945 2942          sgllen = mattr->dma_attr_sgllen;
2946 2943          pfnseg = mmu_btop(mattr->dma_attr_seg);
2947 2944          align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
                   /* pfnalign stays 0 unless alignment beyond one page is required */
2948 2945          if (align > MMU_PAGESIZE)
2949 2946                  pfnalign = mmu_btop(align);
2950 2947  
2951 2948          contig = flags & PG_PHYSCONTIG;
                   /* -1 is the caller's request for one page, alignment ignored */
2952 2949          if (npages == -1) {
2953 2950                  npages = 1;
2954 2951                  pfnalign = 0;
2955 2952          }
2956 2953          /*
2957 2954           * Clear the contig flag if only one page is needed.
2958 2955           */
2959 2956          if (npages == 1) {
2960 2957                  getone = 1;
2961 2958                  contig = 0;
2962 2959          }
2963 2960  
2964 2961          /*
2965 2962           * Check if any page in the system is fine.
2966 2963           */
2967 2964          anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2968 2965          if (!contig && anyaddr && !pfnalign) {
2969 2966                  flags &= ~PG_PHYSCONTIG;
2970 2967                  plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2971 2968                      flags, &kvseg, vaddr);
2972 2969                  if (plist != NULL) {
2973 2970                          *npagesp = 0;
2974 2971                          return (plist);
2975 2972                  }
                           /* shortcut failed; fall through to the constrained search */
2976 2973          }
2977 2974          plist = NULL;
                   /* run length: the request spread over the available sgllen entries */
2978 2975          minctg = howmany(npages, sgllen);
2979 2976          while (npages > sgllen || getone) {
                           /* never ask for a run longer than what is still needed */
2980 2977                  if (minctg > npages)
2981 2978                          minctg = npages;
2982 2979                  mcpl = NULL;
2983 2980                  /*
2984 2981                   * We could want contig pages with no address range limits.
2985 2982                   */
2986 2983                  if (anyaddr && contig) {
2987 2984                          /*
2988 2985                           * Look for free contig pages to satisfy the request.
2989 2986                           */
2990 2987                          mcpl = find_contig_free(minctg, flags, pfnseg,
2991 2988                              pfnalign);
2992 2989                  }
2993 2990                  /*
2994 2991                   * Try the reserved io pools next
2995 2992                   */
2996 2993                  if (mcpl == NULL)
2997 2994                          mcpl = page_io_pool_alloc(mattr, contig, minctg);
2998 2995                  if (mcpl != NULL) {
                                   /*
                                    * Walk the circular run once: hash each page in at
                                    * <vp, off>, clear its free and aged state, mark it
                                    * referenced and take its I/O lock.
                                    */
2999 2996                          pp = mcpl;
3000 2997                          do {
3001 2998                                  if (!page_hashin(pp, vp, off, NULL)) {
3002 2999                                          panic("page_get_contigpages:"
3003 3000                                              " hashin failed"
3004 3001                                              " pp %p, vp %p, off %llx",
3005 3002                                              (void *)pp, (void *)vp, off);
3006 3003                                  }
3007 3004                                  off += MMU_PAGESIZE;
3008 3005                                  PP_CLRFREE(pp);
3009 3006                                  PP_CLRAGED(pp);
3010 3007                                  page_set_props(pp, P_REF);
3011 3008                                  page_io_lock(pp);
3012 3009                                  pp = pp->p_next;
3013 3010                          } while (pp != mcpl);
3014 3011                  } else {
3015 3012                          /*
3016 3013                           * Hypervisor exchange doesn't handle segment or
3017 3014                           * alignment constraints
3018 3015                           */
3019 3016                          if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3020 3017                              pfnalign)
3021 3018                                  goto fail;
3022 3019                          /*
3023 3020                           * Try exchanging pages with the hypervisor
3024 3021                           */
3025 3022                          mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3026 3023                              flags, minctg);
3027 3024                          if (mcpl == NULL)
3028 3025                                  goto fail;
                                   /* off is advanced here for the whole run at once */
3029 3026                          off += minctg * MMU_PAGESIZE;
3030 3027                  }
3031 3028                  check_dma(mattr, mcpl, minctg);
3032 3029                  /*
3033 3030                   * Here with a minctg run of contiguous pages, add them to the
3034 3031                   * list we will return for this request.
3035 3032                   */
3036 3033                  page_list_concat(&plist, &mcpl);
3037 3034                  npages -= minctg;
                           /* keep the caller's count of outstanding pages current */
3038 3035                  *npagesp = npages;
                           /* NOTE(review): presumably each run uses one s/g entry - confirm */
3039 3036                  sgllen--;
3040 3037                  if (getone)
3041 3038                          break;
3042 3039          }
3043 3040          return (plist);
3044 3041  fail:
3045 3042          return_partial_alloc(plist);
3046 3043          return (NULL);
3047 3044  }
3048 3045  
3049 3046  /*
3050 3047   * Allocator for domain 0 I/O pages. We match the required
3051 3048   * DMA attributes and contiguity constraints.
            *
            * vp, off  - identity for the new pages.
            * bytes    - size of the request; rounded up to whole pages.
            * flags    - page creation flags; PG_PHYSCONTIG requests contiguity.
            * as       - not referenced in the body; &kvseg is always passed down.
            * vaddr    - address associated with the first page; advanced as pages
            *            are collected.
            * mattr    - DMA attributes; must not be NULL (asserted).
            *
            * Returns the list of pages, or NULL on failure.  On the fail path any
            * pages already collected are released with return_partial_alloc().
            * A domU only ever uses page_create_va() and never reaches the
            * constrained allocation code below it.
3052 3049   */
3053 3050  /*ARGSUSED*/
3054 3051  page_t *
3055 3052  page_create_io(
3056 3053          struct vnode    *vp,
3057 3054          u_offset_t      off,
3058 3055          uint_t          bytes,
3059 3056          uint_t          flags,
3060 3057          struct as       *as,
3061 3058          caddr_t         vaddr,
3062 3059          ddi_dma_attr_t  *mattr)
3063 3060  {
3064 3061          page_t  *plist = NULL, *pp;
3065 3062          int     npages = 0, contig, anyaddr, pages_req;
3066 3063          mfn_t   lo_mfn;
3067 3064          mfn_t   hi_mfn;
3068 3065          pgcnt_t pfnalign = 0;
3069 3066          int     align;
3070 3067          int     is_domu = 0;
3071 3068          int     dummy, bytes_got;
3072 3069          mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3073 3070  
3074 3071          ASSERT(mattr != NULL);
3075 3072          lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3076 3073          hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3077 3074          align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
                   /* pfnalign stays 0 unless alignment beyond one page is required */
3078 3075          if (align > MMU_PAGESIZE)
3079 3076                  pfnalign = mmu_btop(align);
3080 3077  
3081 3078          /*
3082 3079           * Clear the contig flag if only one page is needed or the scatter
3083 3080           * gather list length is >= npages.
3084 3081           */
3085 3082          pages_req = npages = mmu_btopr(bytes);
3086 3083          contig = (flags & PG_PHYSCONTIG);
3087 3084          bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3088 3085          if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3089 3086                  contig = 0;
3090 3087  
3091 3088          /*
3092 3089           * Check if any old page in the system is fine.
3093 3090           * DomU should always go down this path.
3094 3091           */
3095 3092          is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3096 3093          anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3097 3094          if ((!contig && anyaddr) || is_domu) {
3098 3095                  flags &= ~PG_PHYSCONTIG;
3099 3096                  plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3100 3097                  if (plist != NULL)
3101 3098                          return (plist);
3102 3099                  else if (is_domu)
3103 3100                          return (NULL); /* no memory available */
3104 3101          }
3105 3102          /*
3106 3103           * DomU should never reach here
3107 3104           */
3108 3105          if (contig) {
                           /* npages is updated in place to the count still outstanding */
3109 3106                  plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3110 3107                      mattr);
3111 3108                  if (plist == NULL)
3112 3109                          goto fail;
                           /* step vaddr and off past the pages obtained so far */
3113 3110                  bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3114 3111                  vaddr += bytes_got;
3115 3112                  off += bytes_got;
3116 3113                  /*
3117 3114                   * We now have all the contiguous pages we need, but
3118 3115                   * we may still need additional non-contiguous pages.
3119 3116                   */
3120 3117          }
3121 3118          /*
3122 3119           * now loop collecting the requested number of pages, these do
3123 3120           * not have to be contiguous pages but we will use the contig
3124 3121           * page alloc code to get the pages since it will honor any
3125 3122           * other constraints the pages may have.
3126 3123           */
3127 3124          while (npages--) {
                           /* -1 asks page_get_contigpages() for exactly one page */
3128 3125                  dummy = -1;
3129 3126                  pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3130 3127                  if (pp == NULL)
3131 3128                          goto fail;
3132 3129                  page_add(&plist, pp);
3133 3130                  vaddr += MMU_PAGESIZE;
3134 3131                  off += MMU_PAGESIZE;
3135 3132          }
3136 3133          return (plist);
3137 3134  fail:
3138 3135          /*
3139 3136           * Failed to get enough pages, return ones we did get
3140 3137           */
3141 3138          return_partial_alloc(plist);
3142 3139          return (NULL);
3143 3140  }
3144 3141  
3145 3142  /*
3146 3143   * Lock and return the page with the highest mfn that we can find.  last_mfn
3147 3144   * holds the last one found, so the next search can start from there.  We
3148 3145   * also keep a counter so that we don't loop forever if the machine has no
3149 3146   * free pages.
3150 3147   *
3151 3148   * This is called from the balloon thread to find pages to give away.  new_high
3152 3149   * is used when new mfn's have been added to the system - we will reset our
3153 3150   * search if the new mfn's are higher than our current search position.
            *
            * Returns the page obtained from page_numtopp_alloc() with its free bit
            * cleared, or NULL once mfn_count candidates have been examined without
            * success.
3154 3151   */
3155 3152  page_t *
3156 3153  page_get_high_mfn(mfn_t new_high)
3157 3154  {
                   /* search position; static, so it persists between calls */
3158 3155          static mfn_t last_mfn = 0;
3159 3156          pfn_t pfn;
3160 3157          page_t *pp;
3161 3158          ulong_t loop_count = 0;
3162 3159  
3163 3160          if (new_high > last_mfn)
3164 3161                  last_mfn = new_high;
3165 3162  
                   /* walk downward, examining at most mfn_count candidates */
3166 3163          for (; loop_count < mfn_count; loop_count++, last_mfn--) {
                           /* reaching 0 wraps the search back up to cached_max_mfn */
3167 3164                  if (last_mfn == 0) {
3168 3165                          last_mfn = cached_max_mfn;
3169 3166                  }
3170 3167  
                           /* skip mfns that mfn_to_pfn() marks with PFN_IS_FOREIGN_MFN */
3171 3168                  pfn = mfn_to_pfn(last_mfn);
3172 3169                  if (pfn & PFN_IS_FOREIGN_MFN)
3173 3170                          continue;
3174 3171  
3175 3172                  /* See if the page is free.  If so, lock it. */
3176 3173                  pp = page_numtopp_alloc(pfn);
3177 3174                  if (pp == NULL)
3178 3175                          continue;
3179 3176                  PP_CLRFREE(pp);
3180 3177  
3181 3178                  ASSERT(PAGE_EXCL(pp));
3182 3179                  ASSERT(pp->p_vnode == NULL);
3183 3180                  ASSERT(!hat_page_is_mapped(pp));
                           /*
                            * The return bypasses the loop's own decrement, so step
                            * past this mfn here for the benefit of the next call.
                            */
3184 3181                  last_mfn--;
3185 3182                  return (pp);
3186 3183          }
3187 3184          return (NULL);
3188 3185  }
3189 3186  
3190 3187  #else /* !__xpv */
3191 3188  
3192 3189  /*
3193 3190   * get a page from any list with the given mnode
            *
            * The free lists are searched first, for every mtype produced by
            * MTYPE_START()/MTYPE_NEXT(); if that fails the cache lists are searched
            * the same way, starting again from the initial mtype.  A page qualifies
            * when it can be locked SE_EXCL, is not a dump page, and its machine
            * address range [pgaddr, pgaddr + MMU_PAGESIZE - 1] lies within
            * dma_attr_addr_lo .. dma_attr_addr_hi.
            *
            * origbin  - bin (color) at which each search starts.
            * szc      - page size code; must be 0 (asserted).
            * flags    - must not contain PG_MATCH_COLOR (asserted).
            * mnode    - memory node to search.
            * mtype    - starting memory type, adjusted by MTYPE_START().
            * dma_attr - DMA address limits; must not be NULL (asserted).
            *
            * Returns the page, exclusively locked and removed from its list, or
            * NULL when nothing suitable was found.
3194 3191   */
3195 3192  static page_t *
3196 3193  page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3197 3194      int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3198 3195  {
3199 3196          kmutex_t                *pcm;
3200 3197          int                     i;
3201 3198          page_t                  *pp;
3202 3199          page_t                  *first_pp;
3203 3200          uint64_t                pgaddr;
3204 3201          ulong_t                 bin;
3205 3202          int                     mtypestart;
3206 3203          int                     plw_initialized;
3207 3204          page_list_walker_t      plw;
3208 3205  
3209 3206          VM_STAT_ADD(pga_vmstats.pgma_alloc);
3210 3207  
3211 3208          ASSERT((flags & PG_MATCH_COLOR) == 0);
3212 3209          ASSERT(szc == 0);
3213 3210          ASSERT(dma_attr != NULL);
3214 3211  
3215 3212          MTYPE_START(mnode, mtype, flags);
                   /* a negative mtype after MTYPE_START() means there is nothing to search */
3216 3213          if (mtype < 0) {
3217 3214                  VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3218 3215                  return (NULL);
3219 3216          }
3220 3217  
                   /* remembered so the cache list pass can restart from the same mtype */
3221 3218          mtypestart = mtype;
3222 3219  
3223 3220          bin = origbin;
3224 3221  
3225 3222          /*
3226 3223           * check up to page_colors + 1 bins - origbin may be checked twice
3227 3224           * because of BIN_STEP skip
3228 3225           */
3229 3226          do {
3230 3227                  plw_initialized = 0;
3231 3228  
3232 3229                  for (plw.plw_count = 0;
3233 3230                      plw.plw_count < page_colors; plw.plw_count++) {
3234 3231  
                                   /* unlocked peek: skip empty bins without taking the mutex */
3235 3232                          if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3236 3233                                  goto nextfreebin;
3237 3234  
3238 3235                          pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3239 3236                          mutex_enter(pcm);
                                   /* re-read the list head now that the bin mutex is held */
3240 3237                          pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3241 3238                          first_pp = pp;
                                   /* the list is circular: stop once we are back at first_pp */
3242 3239                          while (pp != NULL) {
3243 3240                                  if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3244 3241                                      SE_EXCL) == 0) {
3245 3242                                          pp = pp->p_next;
3246 3243                                          if (pp == first_pp) {
3247 3244                                                  pp = NULL;
3248 3245                                          }
3249 3246                                          continue;
3250 3247                                  }
3251 3248  
3252 3249                                  ASSERT(PP_ISFREE(pp));
3253 3250                                  ASSERT(PP_ISAGED(pp));
3254 3251                                  ASSERT(pp->p_vnode == NULL);
3255 3252                                  ASSERT(pp->p_hash == NULL);
3256 3253                                  ASSERT(pp->p_offset == (u_offset_t)-1);
3257 3254                                  ASSERT(pp->p_szc == szc);
3258 3255                                  ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3259 3256                                  /* check if page within DMA attributes */
3260 3257                                  pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3261 3258                                  if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3262 3259                                      (pgaddr + MMU_PAGESIZE - 1 <=
3263 3260                                      dma_attr->dma_attr_addr_hi)) {
3264 3261                                          break;
3265 3262                                  }
3266 3263  
3267 3264                                  /* continue looking */
3268 3265                                  page_unlock(pp);
3269 3266                                  pp = pp->p_next;
3270 3267                                  if (pp == first_pp)
3271 3268                                          pp = NULL;
3272 3269  
3273 3270                          }
3274 3271                          if (pp != NULL) {
3275 3272                                  ASSERT(mtype == PP_2_MTYPE(pp));
3276 3273                                  ASSERT(pp->p_szc == 0);
3277 3274  
3278 3275                                  /* found a page with specified DMA attributes */
3279 3276                                  page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3280 3277                                      mtype), pp);
3281 3278                                  page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3282 3279  
3283 3280                                  if ((PP_ISFREE(pp) == 0) ||
3284 3281                                      (PP_ISAGED(pp) == 0)) {
3285 3282                                          cmn_err(CE_PANIC, "page %p is not free",
3286 3283                                              (void *)pp);
3287 3284                                  }
3288 3285  
3289 3286                                  mutex_exit(pcm);
3290 3287                                  check_dma(dma_attr, pp, 1);
3291 3288                                  VM_STAT_ADD(pga_vmstats.pgma_allocok);
3292 3289                                  return (pp);
3293 3290                          }
3294 3291                          mutex_exit(pcm);
3295 3292  nextfreebin:
                                   /* the walker is set up once per mtype, on the first miss */
3296 3293                          if (plw_initialized == 0) {
3297 3294                                  page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3298 3295                                  ASSERT(plw.plw_ceq_dif == page_colors);
3299 3296                                  plw_initialized = 1;
3300 3297                          }
3301 3298  
                                   /*
                                    * When the walker asks for it, try page_freelist_split()
                                    * restricted to the DMA pfn range.
                                    */
3302 3299                          if (plw.plw_do_split) {
3303 3300                                  pp = page_freelist_split(szc, bin, mnode,
3304 3301                                      mtype,
3305 3302                                      mmu_btop(dma_attr->dma_attr_addr_lo),
3306 3303                                      mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3307 3304                                      &plw);
3308 3305                                  if (pp != NULL) {
3309 3306                                          check_dma(dma_attr, pp, 1);
3310 3307                                          return (pp);
3311 3308                                  }
3312 3309                          }
3313 3310  
3314 3311                          bin = page_list_walk_next_bin(szc, bin, &plw);
3315 3312                  }
3316 3313  
3317 3314                  MTYPE_NEXT(mnode, mtype, flags);
3318 3315          } while (mtype >= 0);
3319 3316  
3320 3317          /* failed to find a page in the freelist; try it in the cachelist */
3321 3318  
3322 3319          /* reset mtype start for cachelist search */
3323 3320          mtype = mtypestart;
3324 3321          ASSERT(mtype >= 0);
3325 3322  
3326 3323          /* start with the bin of matching color */
3327 3324          bin = origbin;
3328 3325  
3329 3326          do {
                           /* visits page_colors + 1 bins (note the <= bound) */
3330 3327                  for (i = 0; i <= page_colors; i++) {
                                   /* unlocked peek: skip empty bins without taking the mutex */
3331 3328                          if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3332 3329                                  goto nextcachebin;
3333 3330                          pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3334 3331                          mutex_enter(pcm);
3335 3332                          pp = PAGE_CACHELISTS(mnode, bin, mtype);
3336 3333                          first_pp = pp;
                                   /* same circular-list scan as the free list pass above */
3337 3334                          while (pp != NULL) {
3338 3335                                  if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3339 3336                                      SE_EXCL) == 0) {
3340 3337                                          pp = pp->p_next;
3341 3338                                          if (pp == first_pp)
3342 3339                                                  pp = NULL;
3343 3340                                          continue;
3344 3341                                  }
3345 3342                                  ASSERT(pp->p_vnode);
3346 3343                                  ASSERT(PP_ISAGED(pp) == 0);
3347 3344                                  ASSERT(pp->p_szc == 0);
3348 3345                                  ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3349 3346  
3350 3347                                  /* check if page within DMA attributes */
3351 3348  
3352 3349                                  pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3353 3350                                  if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3354 3351                                      (pgaddr + MMU_PAGESIZE - 1 <=
3355 3352                                      dma_attr->dma_attr_addr_hi)) {
3356 3353                                          break;
3357 3354                                  }
3358 3355  
3359 3356                                  /* continue looking */
3360 3357                                  page_unlock(pp);
3361 3358                                  pp = pp->p_next;
3362 3359                                  if (pp == first_pp)
3363 3360                                          pp = NULL;
3364 3361                          }
3365 3362  
3366 3363                          if (pp != NULL) {
3367 3364                                  ASSERT(mtype == PP_2_MTYPE(pp));
3368 3365                                  ASSERT(pp->p_szc == 0);
3369 3366  
3370 3367                                  /* found a page with specified DMA attributes */
3371 3368                                  page_sub(&PAGE_CACHELISTS(mnode, bin,
3372 3369                                      mtype), pp);
3373 3370                                  page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3374 3371  
3375 3372                                  mutex_exit(pcm);
3376 3373                                  ASSERT(pp->p_vnode);
3377 3374                                  ASSERT(PP_ISAGED(pp) == 0);
3378 3375                                  check_dma(dma_attr, pp, 1);
3379 3376                                  VM_STAT_ADD(pga_vmstats.pgma_allocok);
3380 3377                                  return (pp);
3381 3378                          }
3382 3379                          mutex_exit(pcm);
3383 3380  nextcachebin:
                                   /* first step jumps by BIN_STEP, later steps by one; wrap with the mask */
3384 3381                          bin += (i == 0) ? BIN_STEP : 1;
3385 3382                          bin &= page_colors_mask;
3386 3383                  }
3387 3384                  MTYPE_NEXT(mnode, mtype, flags);
3388 3385          } while (mtype >= 0);
3389 3386  
3390 3387          VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3391 3388          return (NULL);
3392 3389  }
3393 3390  
3394 3391  /*
3395 3392   * This function is similar to page_get_freelist()/page_get_cachelist()
3396 3393   * but it searches both the lists to find a page with the specified
3397 3394   * color (or no color) and DMA attributes. The search is done in the
3398 3395   * freelist first and then in the cache list within the highest memory
3399 3396   * range (based on DMA attributes) before searching in the lower
3400 3397   * memory ranges.
3401 3398   *
3402 3399   * Note: This function is called only by page_create_io().
            *
            * vp, as, vaddr - fed to AS_2_BIN() to choose the starting bin (color).
            * off           - not referenced in the body.
            * size          - must be MMU_PAGESIZE; anything else returns NULL.
            * flags         - passed to the list functions; PGI_MT_RANGE0 is added
            *                 when the low end of the range is mtype16m.
            * dma_attr      - DMA limits, or NULL for no address restriction.
            * lgrp          - lgroup to allocate from; the home lgroup is used when
            *                 LGRP_EXISTS() is false for it.
            *
            * Returns one page, or NULL when dma_attr asks for alignment greater
            * than MMU_PAGESIZE, when its low address is above its high address,
            * or when no memory node could supply a suitable page.
3403 3400   */
3404 3401  /*ARGSUSED*/
3405 3402  static page_t *
3406 3403  page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3407 3404      size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3408 3405  {
3409 3406          uint_t          bin;
3410 3407          int             mtype;
3411 3408          page_t          *pp;
3412 3409          int             n;
3413 3410          int             m;
3414 3411          int             szc;
3415 3412          int             fullrange;
3416 3413          int             mnode;
3417 3414          int             local_failed_stat = 0;
3418 3415          lgrp_mnode_cookie_t     lgrp_cookie;
3419 3416  
3420 3417          VM_STAT_ADD(pga_vmstats.pga_alloc);
3421 3418  
3422 3419          /* only base pagesize currently supported */
3423 3420          if (size != MMU_PAGESIZE)
3424 3421                  return (NULL);
3425 3422  
3426 3423          /*
3427 3424           * If we're passed a specific lgroup, we use it.  Otherwise,
3428 3425           * assume first-touch placement is desired.
3429 3426           */
3430 3427          if (!LGRP_EXISTS(lgrp))
3431 3428                  lgrp = lgrp_home_lgrp();
3432 3429  
                   /*
                    * NOTE(review): "seg" is not a local of this function; presumably
                    * AS_2_BIN() does not expand that argument here - confirm.
                    */
3433 3430          /* LINTED */
3434 3431          AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3435 3432  
3436 3433          /*
3437 3434           * Only hold one freelist or cachelist lock at a time, that way we
3438 3435           * can start anywhere and not have to worry about lock
3439 3436           * ordering.
3440 3437           */
                   /* n and m become the lowest and highest mtypes to be searched */
3441 3438          if (dma_attr == NULL) {
3442 3439                  n = mtype16m;
3443 3440                  m = mtypetop;
3444 3441                  fullrange = 1;
3445 3442                  VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3446 3443          } else {
3447 3444                  pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3448 3445                  pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3449 3446  
3450 3447                  /*
3451 3448                   * We can guarantee alignment only for page boundary.
3452 3449                   */
3453 3450                  if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3454 3451                          return (NULL);
3455 3452  
3456 3453                  /* Sanity check the dma_attr */
3457 3454                  if (pfnlo > pfnhi)
3458 3455                          return (NULL);
3459 3456  
3460 3457                  n = pfn_2_mtype(pfnlo);
3461 3458                  m = pfn_2_mtype(pfnhi);
3462 3459  
                           /*
                            * fullrange: the DMA window starts exactly at the bottom of
                            * range n and reaches at least the top of range m, so pages
                            * need no per-page address check.
                            */
3463 3460                  fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3464 3461                      (pfnhi >= mnoderanges[m].mnr_pfnhi));
3465 3462          }
3466 3463          VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3467 3464  
3468 3465          szc = 0;
3469 3466  
3470 3467          /* cycling thru mtype handled by RANGE0 if n == mtype16m */
3471 3468          if (n == mtype16m) {
3472 3469                  flags |= PGI_MT_RANGE0;
3473 3470                  n = m;
3474 3471          }
3475 3472  
3476 3473          /*
3477 3474           * Try local memory node first, but try remote if we can't
3478 3475           * get a page of the right color.
3479 3476           */
3480 3477          LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3481 3478          while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3482 3479                  /*
3483 3480                   * allocate pages from high pfn to low.
3484 3481                   */
3485 3482                  mtype = m;
3486 3483                  do {
                                   /*
                                    * With a full range the ordinary free/cache list
                                    * functions suffice; otherwise use the variant that
                                    * checks each page against dma_attr.
                                    */
3487 3484                          if (fullrange != 0) {
3488 3485                                  pp = page_get_mnode_freelist(mnode,
3489 3486                                      bin, mtype, szc, flags);
3490 3487                                  if (pp == NULL) {
3491 3488                                          pp = page_get_mnode_cachelist(
3492 3489                                              bin, flags, mnode, mtype);
3493 3490                                  }
3494 3491                          } else {
3495 3492                                  pp = page_get_mnode_anylist(bin, szc,
3496 3493                                      flags, mnode, mtype, dma_attr);
3497 3494                          }
3498 3495                          if (pp != NULL) {
3499 3496                                  VM_STAT_ADD(pga_vmstats.pga_allocok);
3500 3497                                  check_dma(dma_attr, pp, 1);
3501 3498                                  return (pp);
3502 3499                          }
                           /* follow mnr_next until range n has been tried or the chain ends */
3503 3500                  } while (mtype != n &&
3504 3501                      (mtype = mnoderanges[mtype].mnr_next) != -1);
                           /* the lgroup failure statistic is bumped at most once per call */
3505 3502                  if (!local_failed_stat) {
3506 3503                          lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3507 3504                          local_failed_stat = 1;
3508 3505                  }
3509 3506          }
3510 3507          VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3511 3508  
3512 3509          return (NULL);
3513 3510  }
3514 3511  
3515 3512  /*
3516 3513   * page_create_io()
3517 3514   *
3518 3515   * This function is a copy of page_create_va() with an additional
3519 3516   * argument 'mattr' that specifies DMA memory requirements to
3520 3517   * the page list functions. This function is used by the segkmem
3521 3518   * allocator so it is used only to create new pages (i.e. PG_EXCL is
3522 3519   * set).
3523 3520   *
3524 3521   * Note: This interface is currently used by x86 PSM only and is
3525 3522   *       not fully specified so the commitment level is only for
3526 3523   *       private interface specific to x86. This interface uses PSM
3527 3524   *       specific page_get_anylist() interface.
3528 3525   */
3529 3526  
/*
 * Walk the page_hash[] chain selected by 'index' looking for the page whose
 * identity is <vp, off>.  When the macro completes, 'pp' points at the
 * matching page, or is NULL if the chain holds no such page.
 *
 * 'pp', 'vp' and 'off' are evaluated once per chain entry, so the arguments
 * must be free of side effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon is exactly one statement and can safely be used as the
 * unbraced arm of an if/else.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) do { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
} while (0)
3536 3533  
3537 3534  
3538 3535  page_t *
3539 3536  page_create_io(
3540 3537          struct vnode    *vp,
3541 3538          u_offset_t      off,
3542 3539          uint_t          bytes,
3543 3540          uint_t          flags,
3544 3541          struct as       *as,
3545 3542          caddr_t         vaddr,
3546 3543          ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3547 3544  {
3548 3545          page_t          *plist = NULL;
3549 3546          uint_t          plist_len = 0;
3550 3547          pgcnt_t         npages;
3551 3548          page_t          *npp = NULL;
3552 3549          uint_t          pages_req;
3553 3550          page_t          *pp;
3554 3551          kmutex_t        *phm = NULL;
3555 3552          uint_t          index;
3556 3553  
3557 3554          TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3558 3555              "page_create_start:vp %p off %llx bytes %u flags %x",
3559 3556              vp, off, bytes, flags);
3560 3557  
3561 3558          ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3562 3559  
3563 3560          pages_req = npages = mmu_btopr(bytes);
3564 3561  
3565 3562          /*
3566 3563           * Do the freemem and pcf accounting.
3567 3564           */
3568 3565          if (!page_create_wait(npages, flags)) {
3569 3566                  return (NULL);
3570 3567          }
3571 3568  
3572 3569          TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3573 3570              "page_create_success:vp %p off %llx", vp, off);
3574 3571  
3575 3572          /*
3576 3573           * If satisfying this request has left us with too little
3577 3574           * memory, start the wheels turning to get some back.  The
3578 3575           * first clause of the test prevents waking up the pageout
3579 3576           * daemon in situations where it would decide that there's
3580 3577           * nothing to do.
3581 3578           */
3582 3579          if (nscan < desscan && freemem < minfree) {
3583 3580                  TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3584 3581                      "pageout_cv_signal:freemem %ld", freemem);
3585 3582                  cv_signal(&proc_pageout->p_cv);
3586 3583          }
3587 3584  
3588 3585          if (flags & PG_PHYSCONTIG) {
3589 3586  
3590 3587                  plist = page_get_contigpage(&npages, mattr, 1);
3591 3588                  if (plist == NULL) {
3592 3589                          page_create_putback(npages);
3593 3590                          return (NULL);
3594 3591                  }
3595 3592  
3596 3593                  pp = plist;
3597 3594  
3598 3595                  do {
3599 3596                          if (!page_hashin(pp, vp, off, NULL)) {
3600 3597                                  panic("pg_creat_io: hashin failed %p %p %llx",
3601 3598                                      (void *)pp, (void *)vp, off);
3602 3599                          }
3603 3600                          VM_STAT_ADD(page_create_new);
3604 3601                          off += MMU_PAGESIZE;
3605 3602                          PP_CLRFREE(pp);
3606 3603                          PP_CLRAGED(pp);
3607 3604                          page_set_props(pp, P_REF);
3608 3605                          pp = pp->p_next;
3609 3606                  } while (pp != plist);
3610 3607  
3611 3608                  if (!npages) {
3612 3609                          check_dma(mattr, plist, pages_req);
3613 3610                          return (plist);
3614 3611                  } else {
3615 3612                          vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3616 3613                  }
3617 3614  
3618 3615                  /*
3619 3616                   * fall-thru:
3620 3617                   *
3621 3618                   * page_get_contigpage returns when npages <= sgllen.
3622 3619                   * Grab the rest of the non-contig pages below from anylist.
3623 3620                   */
3624 3621          }
3625 3622  
3626 3623          /*
3627 3624           * Loop around collecting the requested number of pages.
3628 3625           * Most of the time, we have to `create' a new page. With
3629 3626           * this in mind, pull the page off the free list before
3630 3627           * getting the hash lock.  This will minimize the hash
3631 3628           * lock hold time, nesting, and the like.  If it turns
3632 3629           * out we don't need the page, we put it back at the end.
3633 3630           */
3634 3631          while (npages--) {
3635 3632                  phm = NULL;
3636 3633  
3637 3634                  index = PAGE_HASH_FUNC(vp, off);
3638 3635  top:
3639 3636                  ASSERT(phm == NULL);
3640 3637                  ASSERT(index == PAGE_HASH_FUNC(vp, off));
3641 3638                  ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3642 3639  
3643 3640                  if (npp == NULL) {
3644 3641                          /*
3645 3642                           * Try to get the page of any color either from
3646 3643                           * the freelist or from the cache list.
3647 3644                           */
3648 3645                          npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3649 3646                              flags & ~PG_MATCH_COLOR, mattr, NULL);
3650 3647                          if (npp == NULL) {
3651 3648                                  if (mattr == NULL) {
3652 3649                                          /*
3653 3650                                           * Not looking for a special page;
3654 3651                                           * panic!
3655 3652                                           */
3656 3653                                          panic("no page found %d", (int)npages);
3657 3654                                  }
3658 3655                                  /*
3659 3656                                   * No page found! This can happen
3660 3657                                   * if we are looking for a page
3661 3658                                   * within a specific memory range
3662 3659                                   * for DMA purposes. If PG_WAIT is
3663 3660                                   * specified then we wait for a
3664 3661                                   * while and then try again. The
3665 3662                                   * wait could be forever if we
3666 3663                                   * don't get the page(s) we need.
3667 3664                                   *
3668 3665                                   * Note: XXX We really need a mechanism
3669 3666                                   * to wait for pages in the desired
3670 3667                                   * range. For now, we wait for any
3671 3668                                   * pages and see if we can use it.
3672 3669                                   */
3673 3670  
3674 3671                                  if ((mattr != NULL) && (flags & PG_WAIT)) {
3675 3672                                          delay(10);
3676 3673                                          goto top;
3677 3674                                  }
3678 3675                                  goto fail; /* undo accounting stuff */
3679 3676                          }
3680 3677  
3681 3678                          if (PP_ISAGED(npp) == 0) {
3682 3679                                  /*
3683 3680                                   * Since this page came from the
3684 3681                                   * cachelist, we must destroy the
3685 3682                                   * old vnode association.
3686 3683                                   */
3687 3684                                  page_hashout(npp, (kmutex_t *)NULL);
3688 3685                          }
3689 3686                  }
3690 3687  
3691 3688                  /*
3692 3689                   * We own this page!
3693 3690                   */
3694 3691                  ASSERT(PAGE_EXCL(npp));
3695 3692                  ASSERT(npp->p_vnode == NULL);
3696 3693                  ASSERT(!hat_page_is_mapped(npp));
3697 3694                  PP_CLRFREE(npp);
3698 3695                  PP_CLRAGED(npp);
3699 3696  
3700 3697                  /*
3701 3698                   * Here we have a page in our hot little mits and are
3702 3699                   * just waiting to stuff it on the appropriate lists.
3703 3700                   * Get the mutex and check to see if it really does
3704 3701                   * not exist.
3705 3702                   */
3706 3703                  phm = PAGE_HASH_MUTEX(index);
3707 3704                  mutex_enter(phm);
3708 3705                  PAGE_HASH_SEARCH(index, pp, vp, off);
3709 3706                  if (pp == NULL) {
3710 3707                          VM_STAT_ADD(page_create_new);
3711 3708                          pp = npp;
3712 3709                          npp = NULL;
3713 3710                          if (!page_hashin(pp, vp, off, phm)) {
3714 3711                                  /*
3715 3712                                   * Since we hold the page hash mutex and
3716 3713                                   * just searched for this page, page_hashin
3717 3714                                   * had better not fail.  If it does, that
3718 3715                                   * means somethread did not follow the
3719 3716                                   * page hash mutex rules.  Panic now and
3720 3717                                   * get it over with.  As usual, go down
3721 3718                                   * holding all the locks.
3722 3719                                   */
3723 3720                                  ASSERT(MUTEX_HELD(phm));
3724 3721                                  panic("page_create: hashin fail %p %p %llx %p",
3725 3722                                      (void *)pp, (void *)vp, off, (void *)phm);
3726 3723  
3727 3724                          }
3728 3725                          ASSERT(MUTEX_HELD(phm));
3729 3726                          mutex_exit(phm);
3730 3727                          phm = NULL;
3731 3728  
3732 3729                          /*
3733 3730                           * Hat layer locking need not be done to set
3734 3731                           * the following bits since the page is not hashed
3735 3732                           * and was on the free list (i.e., had no mappings).
3736 3733                           *
3737 3734                           * Set the reference bit to protect
3738 3735                           * against immediate pageout
3739 3736                           *
3740 3737                           * XXXmh modify freelist code to set reference
3741 3738                           * bit so we don't have to do it here.
3742 3739                           */
3743 3740                          page_set_props(pp, P_REF);
3744 3741                  } else {
3745 3742                          ASSERT(MUTEX_HELD(phm));
3746 3743                          mutex_exit(phm);
3747 3744                          phm = NULL;
3748 3745                          /*
3749 3746                           * NOTE: This should not happen for pages associated
3750 3747                           *       with kernel vnode 'kvp'.
3751 3748                           */
3752 3749                          /* XX64 - to debug why this happens! */
3753 3750                          ASSERT(!VN_ISKAS(vp));
3754 3751                          if (VN_ISKAS(vp))
3755 3752                                  cmn_err(CE_NOTE,
3756 3753                                      "page_create: page not expected "
3757 3754                                      "in hash list for kernel vnode - pp 0x%p",
3758 3755                                      (void *)pp);
3759 3756                          VM_STAT_ADD(page_create_exists);
3760 3757                          goto fail;
3761 3758                  }
3762 3759  
3763 3760                  /*
3764 3761                   * Got a page!  It is locked.  Acquire the i/o
3765 3762                   * lock since we are going to use the p_next and
3766 3763                   * p_prev fields to link the requested pages together.
3767 3764                   */
3768 3765                  page_io_lock(pp);
3769 3766                  page_add(&plist, pp);
3770 3767                  plist = plist->p_next;
3771 3768                  off += MMU_PAGESIZE;
3772 3769                  vaddr += MMU_PAGESIZE;
3773 3770          }
3774 3771  
3775 3772          check_dma(mattr, plist, pages_req);
3776 3773          return (plist);
3777 3774  
3778 3775  fail:
3779 3776          if (npp != NULL) {
3780 3777                  /*
3781 3778                   * Did not need this page after all.
3782 3779                   * Put it back on the free list.
3783 3780                   */
3784 3781                  VM_STAT_ADD(page_create_putbacks);
3785 3782                  PP_SETFREE(npp);
3786 3783                  PP_SETAGED(npp);
3787 3784                  npp->p_offset = (u_offset_t)-1;
3788 3785                  page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3789 3786                  page_unlock(npp);
3790 3787          }
3791 3788  
3792 3789          /*
3793 3790           * Give up the pages we already got.
3794 3791           */
3795 3792          while (plist != NULL) {
3796 3793                  pp = plist;
3797 3794                  page_sub(&plist, pp);
3798 3795                  page_io_unlock(pp);
3799 3796                  plist_len++;
3800 3797                  /*LINTED: constant in conditional ctx*/
3801 3798                  VN_DISPOSE(pp, B_INVAL, 0, kcred);
3802 3799          }
3803 3800  
3804 3801          /*
3805 3802           * VN_DISPOSE does freemem accounting for the pages in plist
3806 3803           * by calling page_free. So, we need to undo the pcf accounting
3807 3804           * for only the remaining pages.
3808 3805           */
3809 3806          VM_STAT_ADD(page_create_putbacks);
3810 3807          page_create_putback(pages_req - plist_len);
3811 3808  
3812 3809          return (NULL);
3813 3810  }
3814 3811  #endif /* !__xpv */
3815 3812  
3816 3813  
3817 3814  /*
3818 3815   * Copy the data from the physical page represented by "frompp" to
3819 3816   * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3820 3817   * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3821 3818   * level and no one sleeps with an active mapping there.
3822 3819   *
3823 3820   * Note that the ref/mod bits in the page_t's are not affected by
3824 3821   * this operation, hence it is up to the caller to update them appropriately.
3825 3822   */
3826 3823  int
3827 3824  ppcopy(page_t *frompp, page_t *topp)
3828 3825  {
3829 3826          caddr_t         pp_addr1;
3830 3827          caddr_t         pp_addr2;
3831 3828          hat_mempte_t    pte1;
3832 3829          hat_mempte_t    pte2;
3833 3830          kmutex_t        *ppaddr_mutex;
3834 3831          label_t         ljb;
3835 3832          int             ret = 1;
3836 3833  
3837 3834          ASSERT_STACK_ALIGNED();
3838 3835          ASSERT(PAGE_LOCKED(frompp));
3839 3836          ASSERT(PAGE_LOCKED(topp));
3840 3837  
3841 3838          if (kpm_enable) {
3842 3839                  pp_addr1 = hat_kpm_page2va(frompp, 0);
3843 3840                  pp_addr2 = hat_kpm_page2va(topp, 0);
3844 3841                  kpreempt_disable();
3845 3842          } else {
3846 3843                  /*
3847 3844                   * disable pre-emption so that CPU can't change
3848 3845                   */
3849 3846                  kpreempt_disable();
3850 3847  
3851 3848                  pp_addr1 = CPU->cpu_caddr1;
3852 3849                  pp_addr2 = CPU->cpu_caddr2;
3853 3850                  pte1 = CPU->cpu_caddr1pte;
3854 3851                  pte2 = CPU->cpu_caddr2pte;
3855 3852  
3856 3853                  ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3857 3854                  mutex_enter(ppaddr_mutex);
3858 3855  
3859 3856                  hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3860 3857                      PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3861 3858                  hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3862 3859                      PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3863 3860                      HAT_LOAD_NOCONSIST);
3864 3861          }
3865 3862  
3866 3863          if (on_fault(&ljb)) {
3867 3864                  ret = 0;
3868 3865                  goto faulted;
3869 3866          }
3870 3867          if (use_sse_pagecopy)
3871 3868  #ifdef __xpv
3872 3869                  page_copy_no_xmm(pp_addr2, pp_addr1);
3873 3870  #else
3874 3871                  hwblkpagecopy(pp_addr1, pp_addr2);
3875 3872  #endif
3876 3873          else
3877 3874                  bcopy(pp_addr1, pp_addr2, PAGESIZE);
3878 3875  
3879 3876          no_fault();
3880 3877  faulted:
3881 3878          if (!kpm_enable) {
3882 3879  #ifdef __xpv
3883 3880                  /*
3884 3881                   * We can't leave unused mappings laying about under the
3885 3882                   * hypervisor, so blow them away.
3886 3883                   */
3887 3884                  if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3888 3885                      UVMF_INVLPG | UVMF_LOCAL) < 0)
3889 3886                          panic("HYPERVISOR_update_va_mapping() failed");
3890 3887                  if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3891 3888                      UVMF_INVLPG | UVMF_LOCAL) < 0)
3892 3889                          panic("HYPERVISOR_update_va_mapping() failed");
3893 3890  #endif
3894 3891                  mutex_exit(ppaddr_mutex);
3895 3892          }
3896 3893          kpreempt_enable();
3897 3894          return (ret);
3898 3895  }
3899 3896  
3900 3897  void
3901 3898  pagezero(page_t *pp, uint_t off, uint_t len)
3902 3899  {
3903 3900          ASSERT(PAGE_LOCKED(pp));
3904 3901          pfnzero(page_pptonum(pp), off, len);
3905 3902  }
3906 3903  
3907 3904  /*
3908 3905   * Zero the physical page from off to off + len given by pfn
3909 3906   * without changing the reference and modified bits of page.
3910 3907   *
3911 3908   * We use this using CPU private page address #2, see ppcopy() for more info.
3912 3909   * pfnzero() must not be called at interrupt level.
3913 3910   */
3914 3911  void
3915 3912  pfnzero(pfn_t pfn, uint_t off, uint_t len)
3916 3913  {
3917 3914          caddr_t         pp_addr2;
3918 3915          hat_mempte_t    pte2;
3919 3916          kmutex_t        *ppaddr_mutex = NULL;
3920 3917  
3921 3918          ASSERT_STACK_ALIGNED();
3922 3919          ASSERT(len <= MMU_PAGESIZE);
3923 3920          ASSERT(off <= MMU_PAGESIZE);
3924 3921          ASSERT(off + len <= MMU_PAGESIZE);
3925 3922  
3926 3923          if (kpm_enable && !pfn_is_foreign(pfn)) {
3927 3924                  pp_addr2 = hat_kpm_pfn2va(pfn);
3928 3925                  kpreempt_disable();
3929 3926          } else {
3930 3927                  kpreempt_disable();
3931 3928  
3932 3929                  pp_addr2 = CPU->cpu_caddr2;
3933 3930                  pte2 = CPU->cpu_caddr2pte;
3934 3931  
3935 3932                  ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3936 3933                  mutex_enter(ppaddr_mutex);
3937 3934  
3938 3935                  hat_mempte_remap(pfn, pp_addr2, pte2,
3939 3936                      PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3940 3937                      HAT_LOAD_NOCONSIST);
3941 3938          }
3942 3939  
3943 3940          if (use_sse_pagezero) {
3944 3941  #ifdef __xpv
3945 3942                  uint_t rem;
3946 3943  
3947 3944                  /*
3948 3945                   * zero a byte at a time until properly aligned for
3949 3946                   * block_zero_no_xmm().
3950 3947                   */
3951 3948                  while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3952 3949                          pp_addr2[off++] = 0;
3953 3950  
3954 3951                  /*
3955 3952                   * Now use faster block_zero_no_xmm() for any range
3956 3953                   * that is properly aligned and sized.
3957 3954                   */
3958 3955                  rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3959 3956                  len -= rem;
3960 3957                  if (len != 0) {
3961 3958                          block_zero_no_xmm(pp_addr2 + off, len);
3962 3959                          off += len;
3963 3960                  }
3964 3961  
3965 3962                  /*
3966 3963                   * zero remainder with byte stores.
3967 3964                   */
3968 3965                  while (rem-- > 0)
3969 3966                          pp_addr2[off++] = 0;
3970 3967  #else
3971 3968                  hwblkclr(pp_addr2 + off, len);
3972 3969  #endif
3973 3970          } else {
3974 3971                  bzero(pp_addr2 + off, len);
3975 3972          }
3976 3973  
3977 3974          if (!kpm_enable || pfn_is_foreign(pfn)) {
3978 3975  #ifdef __xpv
3979 3976                  /*
3980 3977                   * On the hypervisor this page might get used for a page
3981 3978                   * table before any intervening change to this mapping,
3982 3979                   * so blow it away.
3983 3980                   */
3984 3981                  if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3985 3982                      UVMF_INVLPG) < 0)
3986 3983                          panic("HYPERVISOR_update_va_mapping() failed");
3987 3984  #endif
3988 3985                  mutex_exit(ppaddr_mutex);
3989 3986          }
3990 3987  
3991 3988          kpreempt_enable();
3992 3989  }
3993 3990  
3994 3991  /*
3995 3992   * Platform-dependent page scrub call.
3996 3993   */
3997 3994  void
3998 3995  pagescrub(page_t *pp, uint_t off, uint_t len)
3999 3996  {
4000 3997          /*
4001 3998           * For now, we rely on the fact that pagezero() will
4002 3999           * always clear UEs.
4003 4000           */
4004 4001          pagezero(pp, off, len);
4005 4002  }
4006 4003  
4007 4004  /*
4008 4005   * set up two private addresses for use on a given CPU for use in ppcopy()
4009 4006   */
4010 4007  void
4011 4008  setup_vaddr_for_ppcopy(struct cpu *cpup)
4012 4009  {
4013 4010          void *addr;
4014 4011          hat_mempte_t pte_pa;
4015 4012  
4016 4013          addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4017 4014          pte_pa = hat_mempte_setup(addr);
4018 4015          cpup->cpu_caddr1 = addr;
4019 4016          cpup->cpu_caddr1pte = pte_pa;
4020 4017  
4021 4018          addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4022 4019          pte_pa = hat_mempte_setup(addr);
4023 4020          cpup->cpu_caddr2 = addr;
4024 4021          cpup->cpu_caddr2pte = pte_pa;
4025 4022  
4026 4023          mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4027 4024  }
4028 4025  
4029 4026  /*
4030 4027   * Undo setup_vaddr_for_ppcopy
4031 4028   */
4032 4029  void
4033 4030  teardown_vaddr_for_ppcopy(struct cpu *cpup)
4034 4031  {
4035 4032          mutex_destroy(&cpup->cpu_ppaddr_mutex);
4036 4033  
4037 4034          hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4038 4035          cpup->cpu_caddr2pte = 0;
4039 4036          vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4040 4037          cpup->cpu_caddr2 = 0;
4041 4038  
4042 4039          hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4043 4040          cpup->cpu_caddr1pte = 0;
4044 4041          vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4045 4042          cpup->cpu_caddr1 = 0;
4046 4043  }
4047 4044  
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is deliberately a no-op.
 *
 * The parameter list is spelled (void) so that the definition is a
 * proper prototype rather than an old-style declarator.
 */
void
dcache_flushall(void)
{}
4055 4052  
4056 4053  /*
4057 4054   * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4058 4055   * number to vary where the pages come from.  This is quite a hacked up
4059 4056   * method -- it works for now, but really needs to be fixed up a bit.
4060 4057   *
4061 4058   * We currently use page_create_va() on the kvp with fake offsets,
4062 4059   * segments and virt address.  This is pretty bogus, but was copied from the
4063 4060   * old hat_i86.c code.  A better approach would be to specify either mnode
4064 4061   * random or mnode local and takes a page from whatever color has the MOST
4065 4062   * available - this would have a minimal impact on page coloring.
4066 4063   */
4067 4064  page_t *
4068 4065  page_get_physical(uintptr_t seed)
4069 4066  {
4070 4067          page_t *pp;
4071 4068          u_offset_t offset;
4072 4069          static struct seg tmpseg;
4073 4070          static uintptr_t ctr = 0;
4074 4071  
4075 4072          /*
4076 4073           * This code is gross, we really need a simpler page allocator.
4077 4074           *
4078 4075           * We need to assign an offset for the page to call page_create_va()
4079 4076           * To avoid conflicts with other pages, we get creative with the offset.
4080 4077           * For 32 bits, we need an offset > 4Gig
4081 4078           * For 64 bits, need an offset somewhere in the VA hole.
4082 4079           */
4083 4080          offset = seed;
4084 4081          if (offset > kernelbase)
4085 4082                  offset -= kernelbase;
4086 4083          offset <<= MMU_PAGESHIFT;
4087 4084  #if defined(__amd64)
4088 4085          offset += mmu.hole_start;       /* something in VA hole */
4089 4086  #else
4090 4087          offset += 1ULL << 40;   /* something > 4 Gig */
4091 4088  #endif
4092 4089  
4093 4090          if (page_resv(1, KM_NOSLEEP) == 0)
4094 4091                  return (NULL);
4095 4092  
4096 4093  #ifdef  DEBUG
4097 4094          pp = page_exists(&kvp, offset);
4098 4095          if (pp != NULL)
4099 4096                  panic("page already exists %p", (void *)pp);
4100 4097  #endif
4101 4098  
4102 4099          pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4103 4100              &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));   /* changing VA usage */
4104 4101          if (pp != NULL) {
4105 4102                  page_io_unlock(pp);
4106 4103                  page_downgrade(pp);
4107 4104          }
4108 4105          return (pp);
4109 4106  }

↓ open down ↓

2116 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX