1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 
  29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30 /*      All Rights Reserved   */
  31 
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36 
  37 /*
  38  * UNIX machine dependent virtual memory support.
  39  */
  40 
  41 #include <sys/types.h>
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/user.h>
  45 #include <sys/proc.h>
  46 #include <sys/kmem.h>
  47 #include <sys/vmem.h>
  48 #include <sys/buf.h>
  49 #include <sys/cpuvar.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/disp.h>
  52 #include <sys/vm.h>
  53 #include <sys/mman.h>
  54 #include <sys/vnode.h>
  55 #include <sys/cred.h>
  56 #include <sys/exec.h>
  57 #include <sys/exechdr.h>
  58 #include <sys/debug.h>
  59 #include <sys/vmsystm.h>
  60 #include <sys/swap.h>
  61 #include <sys/dumphdr.h>
  62 #include <sys/random.h>
  63 
  64 #include <vm/hat.h>
  65 #include <vm/as.h>
  66 #include <vm/seg.h>
  67 #include <vm/seg_kp.h>
  68 #include <vm/seg_vn.h>
  69 #include <vm/page.h>
  70 #include <vm/seg_kmem.h>
  71 #include <vm/seg_kpm.h>
  72 #include <vm/vm_dep.h>
  73 
  74 #include <sys/cpu.h>
  75 #include <sys/vm_machparam.h>
  76 #include <sys/memlist.h>
  77 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  78 #include <vm/hat_i86.h>
  79 #include <sys/x86_archext.h>
  80 #include <sys/elf_386.h>
  81 #include <sys/cmn_err.h>
  82 #include <sys/archsystm.h>
  83 #include <sys/machsystm.h>
  84 
  85 #include <sys/vtrace.h>
  86 #include <sys/ddidmareq.h>
  87 #include <sys/promif.h>
  88 #include <sys/memnode.h>
  89 #include <sys/stack.h>
  90 #include <util/qsort.h>
  91 #include <sys/taskq.h>
  92 
  93 #ifdef __xpv
  94 
  95 #include <sys/hypervisor.h>
  96 #include <sys/xen_mmu.h>
  97 #include <sys/balloon_impl.h>
  98 
  99 /*
 100  * domain 0 pages usable for DMA are pre-allocated and kept in
 101  * distinct lists, ordered by increasing mfn.
 102  */
 103 static kmutex_t io_pool_lock;
 104 static kmutex_t contig_list_lock;
 105 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 106 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 107 static long io_pool_cnt;
 108 static long io_pool_cnt_max = 0;
 109 #define DEFAULT_IO_POOL_MIN     128
 110 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 111 static long io_pool_cnt_lowater = 0;
 112 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 113 static long io_pool_shrinks;    /* how many times did we really shrink */
 114 static long io_pool_grows;      /* how many times did we grow */
 115 static mfn_t start_mfn = 1;
 116 static caddr_t io_pool_kva;     /* used to alloc pages when needed */
 117 
 118 static int create_contig_pfnlist(uint_t);
 119 
 120 /*
 121  * percentage of phys mem to hold in the i/o pool
 122  */
 123 #define DEFAULT_IO_POOL_PCT     2
 124 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 125 static void page_io_pool_sub(page_t **, page_t *, page_t *);
 126 int ioalloc_dbg = 0;
 127 
 128 #endif /* __xpv */
 129 
 130 uint_t vac_colors = 1;
 131 
 132 int largepagesupport = 0;
 133 extern uint_t page_create_new;
 134 extern uint_t page_create_exists;
 135 extern uint_t page_create_putbacks;
 136 /*
 137  * Allow users to disable the kernel's use of SSE.
 138  */
 139 extern int use_sse_pagecopy, use_sse_pagezero;
 140 
 141 /*
 142  * combined memory ranges from mnode and memranges[] to manage a single
 143  * mnode/mtype dimension in the page lists.
 144  */
 145 typedef struct {
 146         pfn_t   mnr_pfnlo;
 147         pfn_t   mnr_pfnhi;
 148         int     mnr_mnode;
 149         int     mnr_memrange;           /* index into memranges[] */
 150         int     mnr_next;               /* next lower PA mnoderange */
 151         int     mnr_exists;
 152         /* maintain page list stats */
 153         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 154         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 155         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 156 #ifdef DEBUG
 157         struct mnr_mts {                /* mnode/mtype szc stats */
 158                 pgcnt_t mnr_mts_pgcnt;
 159                 int     mnr_mts_colors;
 160                 pgcnt_t *mnr_mtsc_pgcnt;
 161         }       *mnr_mts;
 162 #endif
 163 } mnoderange_t;
 164 
 165 #define MEMRANGEHI(mtype)                                               \
 166         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 167 #define MEMRANGELO(mtype)       (memranges[mtype])
 168 
 169 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 170 
 171 /*
 172  * As the PC architecture evolved, memory was clumped into several
 173  * ranges for various historical I/O devices to do DMA.
 174  * < 16Meg - ISA bus
 175  * < 2Gig - ???
 176  * < 4Gig - PCI bus or drivers that don't understand PAE mode
 177  *
 178  * These are listed in reverse order, so that we can skip over unused
 179  * ranges on machines with small memories.
 180  *
 181  * For now under the Hypervisor, we'll only ever have one memrange.
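      *
      * Illustrative: with the default arch_memranges table below,
      * MEMRANGELO(MRI_16M) is memranges[2] == 0x1000 and MEMRANGEHI(MRI_16M)
      * is memranges[1] - 1 == 0x7ffff, so the MRI_16M mtype covers pfns from
      * 16M up to (but not including) 2G, and MRI_4G covers 0x100000 (4G)
      * through physmax.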
 182  */
 183 #define PFN_4GIG        0x100000
 184 #define PFN_16MEG       0x1000
 185 /* Indices into the memory range (arch_memranges) array. */
 186 #define MRI_4G          0
 187 #define MRI_2G          1
 188 #define MRI_16M         2
 189 #define MRI_0           3
 190 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 191     PFN_4GIG,   /* pfn range for 4G and above */
 192     0x80000,    /* pfn range for 2G-4G */
 193     PFN_16MEG,  /* pfn range for 16M-2G */
 194     0x00000,    /* pfn range for 0-16M */
 195 };
 196 pfn_t *memranges = &arch_memranges[0];
 197 int nranges = NUM_MEM_RANGES;
 198 
 199 /*
 200  * This combines mem_node_config and memranges into one data
 201  * structure to be used for page list management.
 202  */
 203 mnoderange_t    *mnoderanges;
 204 int             mnoderangecnt;
 205 int             mtype4g;
 206 int             mtype16m;
 207 int             mtypetop;       /* index of highest pfn'ed mnoderange */
 208 
 209 /*
 210  * 4g memory management variables for systems with more than 4g of memory:
 211  *
 212  * physical memory below 4g is required for 32bit dma devices and, currently,
 213  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 214  * below 4g can be depleted without any paging activity given that there is
 215  * likely to be sufficient memory above 4g.
 216  *
 217  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 218  * 4g memory management code is enabled only when physmax4g is true.
 219  *
 220  * maxmem4g is the count of the maximum number of pages on the page lists
 221  * with physical addresses below 4g. It can be a lot less than 4g given that
 222  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 223  * agp aperture etc.
 224  *
 225  * freemem4g maintains the count of the number of available pages on the
 226  * page lists with physical addresses below 4g.
 227  *
 228  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 229  * 1/16th (desfree4gshift = 4), roughly 6%, of maxmem4g.
 230  *
 231  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 232  * and the amount of physical memory above 4g is greater than freemem4g.
 233  * In this case, page_get_* routines will restrict below 4g allocations
 234  * for requests that don't specifically require it.
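      *
      * Illustrative: with a full 4g of usable memory below 4g (maxmem4g ==
      * 0x100000 pages of 4K), DESFREE4G works out to 0x10000 pages (256MB);
      * the restriction then kicks in once freemem4g drops below that and
      * below-4g pages make up less than half of total freemem.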
 235  */
 236 
 237 #define DESFREE4G       (maxmem4g >> desfree4gshift)
 238 
 239 #define RESTRICT4G_ALLOC                                        \
 240         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 241 
 242 static pgcnt_t  maxmem4g;
 243 static pgcnt_t  freemem4g;
 244 static int      physmax4g;
 245 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 246 
 247 /*
 248  * 16m memory management:
 249  *
 250  * reserve some amount of physical memory below 16m for legacy devices.
 251  *
 252  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 253  * 16m or if the 16m pool would drop below DESFREE16M.
 254  *
 255  * In this case, general page allocations via page_get_{free,cache}list
 256  * routines will be restricted from allocating from the 16m pool. Allocations
 257  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 258  * are not restricted.
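      *
      * With 4K pages, the default desfree16m of 0x380 pages below is 3.5MB.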
 259  */
 260 
 261 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 262 #define DESFREE16M      desfree16m
 263 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 264         ((freemem != 0) && ((flags & PG_PANIC) == 0) &&             \
 265             ((freemem >= (FREEMEM16M)) ||                    \
 266             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 267 
 268 static pgcnt_t  desfree16m = 0x380;
 269 
 270 /*
 271  * This can be patched via /etc/system to allow old non-PAE aware device
 272  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 273  */
 274 int restricted_kmemalloc = 0;
 275 
 276 #ifdef VM_STATS
 277 struct {
 278         ulong_t pga_alloc;
 279         ulong_t pga_notfullrange;
 280         ulong_t pga_nulldmaattr;
 281         ulong_t pga_allocok;
 282         ulong_t pga_allocfailed;
 283         ulong_t pgma_alloc;
 284         ulong_t pgma_allocok;
 285         ulong_t pgma_allocfailed;
 286         ulong_t pgma_allocempty;
 287 } pga_vmstats;
 288 #endif
 289 
 290 uint_t mmu_page_sizes;
 291 
 292 /* How many page sizes the users can see */
 293 uint_t mmu_exported_page_sizes;
 294 
 295 /* page sizes that legacy applications can see */
 296 uint_t mmu_legacy_page_sizes;
 297 
 298 /*
 299  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 300  * fewer than this many pages.
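      * (With 4K base pages, i.e. MMU_PAGESHIFT == 12, this works out to
      * 0x40000 pages.)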
 301  */
 302 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 303 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 304 
 305 /*
 306  * Maximum and default segment size tunables for user private
 307  * and shared anon memory, and user text and initialized data.
 308  * These can be patched via /etc/system to allow large pages
 309  * to be used for mapping application private and shared anon memory.
 310  */
 311 size_t mcntl0_lpsize = MMU_PAGESIZE;
 312 size_t max_uheap_lpsize = MMU_PAGESIZE;
 313 size_t default_uheap_lpsize = MMU_PAGESIZE;
 314 size_t max_ustack_lpsize = MMU_PAGESIZE;
 315 size_t default_ustack_lpsize = MMU_PAGESIZE;
 316 size_t max_privmap_lpsize = MMU_PAGESIZE;
 317 size_t max_uidata_lpsize = MMU_PAGESIZE;
 318 size_t max_utext_lpsize = MMU_PAGESIZE;
 319 size_t max_shm_lpsize = MMU_PAGESIZE;
 320 
 321 
 322 /*
 323  * initialized by page_coloring_init().
 324  */
 325 uint_t  page_colors;
 326 uint_t  page_colors_mask;
 327 uint_t  page_coloring_shift;
 328 int     cpu_page_colors;
 329 static uint_t   l2_colors;
 330 
 331 /*
 332  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 333  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 334  * mnode range, the page freelist and cachelist are hashed into bins based on
 335  * color. This makes it easier to search for a page within a specific memory
 336  * range.
 337  */
 338 #define PAGE_COLORS_MIN 16
 339 
 340 page_t ****page_freelists;
 341 page_t ***page_cachelists;
 342 
 343 
 344 /*
 345  * Used by page layer to know about page sizes
 346  */
 347 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 348 
 349 kmutex_t        *fpc_mutex[NPC_MUTEX];
 350 kmutex_t        *cpc_mutex[NPC_MUTEX];
 351 
 352 /* Lock to protect mnoderanges array for memory DR operations. */
 353 static kmutex_t mnoderange_lock;
 354 
 355 /*
 356  * Only let one thread at a time try to coalesce large pages, to
 357  * prevent them from working against each other.
 358  */
 359 static kmutex_t contig_lock;
 360 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
 361 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 362 
 363 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 364 
 365 /*
 366  * Return the optimum page size for a given mapping
 367  */
 368 /*ARGSUSED*/
 369 size_t
 370 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 371 {
 372         level_t l = 0;
 373         size_t pgsz = MMU_PAGESIZE;
 374         size_t max_lpsize;
 375         uint_t mszc;
 376 
 377         ASSERT(maptype != MAPPGSZ_VA);
 378 
 379         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 380                 return (MMU_PAGESIZE);
 381         }
 382 
 383         switch (maptype) {
 384         case MAPPGSZ_HEAP:
 385         case MAPPGSZ_STK:
 386                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 387                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 388                 if (max_lpsize == MMU_PAGESIZE) {
 389                         return (MMU_PAGESIZE);
 390                 }
 391                 if (len == 0) {
 392                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 393                             p->p_brksize - p->p_bssbase : p->p_stksize;
 394                 }
 395                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 396                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 397 
 398                 /*
 399                  * use the page size that best fits len
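                       * (illustrative: with 2MB level-1 pages and max_lpsize
                       * of at least 2MB, a len of 3MB settles on pgsz ==
                       * LEVEL_SIZE(1) == 2MB)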
 400                  */
 401                 for (l = mmu.umax_page_level; l > 0; --l) {
 402                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 403                                 continue;
 404                         } else {
 405                                 pgsz = LEVEL_SIZE(l);
 406                         }
 407                         break;
 408                 }
 409 
 410                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 411                     p->p_stkpageszc);
 412                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 413                         pgsz = hw_page_array[mszc].hp_size;
 414                 }
 415                 return (pgsz);
 416 
 417         case MAPPGSZ_ISM:
 418                 for (l = mmu.umax_page_level; l > 0; --l) {
 419                         if (len >= LEVEL_SIZE(l))
 420                                 return (LEVEL_SIZE(l));
 421                 }
 422                 return (LEVEL_SIZE(0));
 423         }
 424         return (pgsz);
 425 }
 426 
 427 static uint_t
 428 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 429     size_t min_physmem)
 430 {
 431         caddr_t eaddr = addr + size;
 432         uint_t szcvec = 0;
 433         caddr_t raddr;
 434         caddr_t readdr;
 435         size_t  pgsz;
 436         int i;
 437 
 438         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 439                 return (0);
 440         }
 441 
 442         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 443                 pgsz = page_get_pagesize(i);
 444                 if (pgsz > max_lpsize) {
 445                         continue;
 446                 }
 447                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 448                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 449                 if (raddr < addr || raddr >= readdr) {
 450                         continue;
 451                 }
 452                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 453                         continue;
 454                 }
 455                 /*
 456                  * Set szcvec to the remaining page sizes.
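                      * (e.g. i == 2 yields ((1 << 3) - 1) & ~1 == 0x6, i.e.
                      * size codes 1 and 2; code 0, the base page size, is
                      * always masked off)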
 457                  */
 458                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 459                 break;
 460         }
 461         return (szcvec);
 462 }
 463 
 464 /*
 465  * Return a bit vector of large page size codes that
 466  * can be used to map [addr, addr + len) region.
 467  */
 468 /*ARGSUSED*/
 469 uint_t
 470 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 471     int memcntl)
 472 {
 473         size_t max_lpsize = mcntl0_lpsize;
 474 
 475         if (mmu.max_page_level == 0)
 476                 return (0);
 477 
 478         if (flags & MAP_TEXT) {
 479                 if (!memcntl)
 480                         max_lpsize = max_utext_lpsize;
 481                 return (map_szcvec(addr, size, off, max_lpsize,
 482                     shm_lpg_min_physmem));
 483 
 484         } else if (flags & MAP_INITDATA) {
 485                 if (!memcntl)
 486                         max_lpsize = max_uidata_lpsize;
 487                 return (map_szcvec(addr, size, off, max_lpsize,
 488                     privm_lpg_min_physmem));
 489 
 490         } else if (type == MAPPGSZC_SHM) {
 491                 if (!memcntl)
 492                         max_lpsize = max_shm_lpsize;
 493                 return (map_szcvec(addr, size, off, max_lpsize,
 494                     shm_lpg_min_physmem));
 495 
 496         } else if (type == MAPPGSZC_HEAP) {
 497                 if (!memcntl)
 498                         max_lpsize = max_uheap_lpsize;
 499                 return (map_szcvec(addr, size, off, max_lpsize,
 500                     privm_lpg_min_physmem));
 501 
 502         } else if (type == MAPPGSZC_STACK) {
 503                 if (!memcntl)
 504                         max_lpsize = max_ustack_lpsize;
 505                 return (map_szcvec(addr, size, off, max_lpsize,
 506                     privm_lpg_min_physmem));
 507 
 508         } else {
 509                 if (!memcntl)
 510                         max_lpsize = max_privmap_lpsize;
 511                 return (map_szcvec(addr, size, off, max_lpsize,
 512                     privm_lpg_min_physmem));
 513         }
 514 }
 515 
 516 /*
 517  * Handle a pagefault.
 518  */
 519 faultcode_t
 520 pagefault(
 521         caddr_t addr,
 522         enum fault_type type,
 523         enum seg_rw rw,
 524         int iskernel)
 525 {
 526         struct as *as;
 527         struct hat *hat;
 528         struct proc *p;
 529         kthread_t *t;
 530         faultcode_t res;
 531         caddr_t base;
 532         size_t len;
 533         int err;
 534         int mapped_red;
 535         uintptr_t ea;
 536 
 537         ASSERT_STACK_ALIGNED();
 538 
 539         if (INVALID_VADDR(addr))
 540                 return (FC_NOMAP);
 541 
 542         mapped_red = segkp_map_red();
 543 
 544         if (iskernel) {
 545                 as = &kas;
 546                 hat = as->a_hat;
 547         } else {
 548                 t = curthread;
 549                 p = ttoproc(t);
 550                 as = p->p_as;
 551                 hat = as->a_hat;
 552         }
 553 
 554         /*
 555          * Dispatch pagefault.
 556          */
 557         res = as_fault(hat, as, addr, 1, type, rw);
 558 
 559         /*
 560          * If this isn't a potential unmapped hole in the user's
 561          * UNIX data or stack segments, just return status info.
 562          */
 563         if (res != FC_NOMAP || iskernel)
 564                 goto out;
 565 
 566         /*
 567          * Check to see if we happened to fault on a currently unmapped
 568          * part of the UNIX data or stack segments.  If so, create a zfod
 569          * mapping there and then try calling the fault routine again.
 570          */
 571         base = p->p_brkbase;
 572         len = p->p_brksize;
 573 
 574         if (addr < base || addr >= base + len) {          /* data seg? */
 575                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 576                 len = p->p_stksize;
 577                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 578                         /* not in either UNIX data or stack segments */
 579                         res = FC_NOMAP;
 580                         goto out;
 581                 }
 582         }
 583 
 584         /*
 585          * The rest of this function implements 3.X/4.X/5.X compatibility.
 586          * This code is probably not needed anymore.
 587          */
 588         if (p->p_model == DATAMODEL_ILP32) {
 589 
 590                 /* expand the gap to the page boundaries on each side */
 591                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 592                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 593                 len = ea - (uintptr_t)base;
 594 
 595                 as_rangelock(as);
 596                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 597                     0) {
 598                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 599                         as_rangeunlock(as);
 600                         if (err) {
 601                                 res = FC_MAKE_ERR(err);
 602                                 goto out;
 603                         }
 604                 } else {
 605                         /*
 606                          * This page was already mapped by another thread after
 607                          * we returned from as_fault() above.  We just fall
 608                          * through to as_fault() below.
 609                          */
 610                         as_rangeunlock(as);
 611                 }
 612 
 613                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 614         }
 615 
 616 out:
 617         if (mapped_red)
 618                 segkp_unmap_red();
 619 
 620         return (res);
 621 }
 622 
 623 void
 624 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 625 {
 626         struct proc *p = curproc;
 627         caddr_t userlimit = (flags & _MAP_LOW32) ?
 628             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 629 
 630         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 631 }
 632 
 633 /*ARGSUSED*/
 634 int
 635 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 636 {
 637         return (0);
 638 }
 639 
 640 /*
 641  * The maximum amount a randomized mapping will be slewed.  We should perhaps
 642  * arrange things so this tunable can be separate for mmap, mmapobj, and
 643  * ld.so.
 644  */
 645 volatile size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 646 
 647 /*
 648  * map_addr_proc() is the routine called when the system is to
 649  * choose an address for the user.  We will pick an address
 650  * range which is the highest available below userlimit.
 651  *
 652  * Every mapping will have a redzone of a single page on either side of
 653  * the request. This is done to leave one page unmapped between segments.
 654  * This is not required for correctness, but it's useful for the user
 655  * because if their program strays across a segment boundary, it will
 656  * catch a fault immediately, making debugging a little easier.  Currently
 657  * the redzone is mandatory.
 658  *
 659  * addrp is a value/result parameter.
 660  *      On input it is a hint from the user to be used in a completely
 661  *      machine dependent fashion.  We decide to completely ignore this hint.
 662  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 663  *      must be some "power of two" multiple of pagesize.
 664  *
 665  *      On output it is NULL if no address can be found in the current
 666  *      process's address space or else an address that is currently
 667  *      not mapped for len bytes with a page of red zone on either side.
 668  *
 669  *      vacalign is not needed on x86 (it's for virtually addressed caches)
 670  */
 671 /*ARGSUSED*/
 672 void
 673 map_addr_proc(
 674         caddr_t *addrp,
 675         size_t len,
 676         offset_t off,
 677         int vacalign,
 678         caddr_t userlimit,
 679         struct proc *p,
 680         uint_t flags)
 681 {
 682         struct as *as = p->p_as;
 683         caddr_t addr;
 684         caddr_t base;
 685         size_t slen;
 686         size_t align_amount;
 687 
 688         ASSERT32(userlimit == as->a_userlimit);
 689 
 690         base = p->p_brkbase;
 691 #if defined(__amd64)
 692         /*
 693          * XX64 Yes, this needs more work.
 694          */
 695         if (p->p_model == DATAMODEL_NATIVE) {
 696                 if (userlimit < as->a_userlimit) {
 697                         /*
 698                          * This happens when a program wants to map
 699                          * something in a range that's accessible to a
 700                          * program in a smaller address space.  For example,
 701                          * a 64-bit program calling mmap32(2) to guarantee
 702                          * that the returned address is below 4Gbytes.
 703                          */
 704                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 705 
 706                         if (userlimit > base)
 707                                 slen = userlimit - base;
 708                         else {
 709                                 *addrp = NULL;
 710                                 return;
 711                         }
 712                 } else {
 713                         /*
 714                          * XX64 This layout is probably wrong .. but in
 715                          * the event we make the amd64 address space look
 716                          * like sparcv9 i.e. with the stack -above- the
 717                          * heap, this bit of code might even be correct.
 718                          */
 719                         slen = p->p_usrstack - base -
 720                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 721                 }
 722         } else
 723 #endif
 724                 slen = userlimit - base;
 725 
 726         /* Make len be a multiple of PAGESIZE */
 727         len = (len + PAGEOFFSET) & PAGEMASK;
 728 
 729         /*
 730          * figure out what the alignment should be
 731          *
 732          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 733          */
 734         if (len <= ELF_386_MAXPGSZ) {
 735                 /*
 736                  * Align virtual addresses to ensure that ELF shared libraries
 737                  * are mapped with the appropriate alignment constraints by
 738                  * the run-time linker.
 739                  */
 740                 align_amount = ELF_386_MAXPGSZ;
 741         } else {
 742                 /*
 743                  * For 32-bit processes, only those which have specified
 744                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 745                  * Not doing so can potentially waste up to 1G of process
 746                  * address space.
 747                  */
 748                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 749                     mmu.umax_page_level;
 750 
 751                 while (lvl && len < LEVEL_SIZE(lvl))
 752                         --lvl;
 753 
 754                 align_amount = LEVEL_SIZE(lvl);
 755         }
 756         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 757                 align_amount = (uintptr_t)*addrp;
 758 
 759         ASSERT(ISP2(align_amount));
 760         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 761 
 762         off = off & (align_amount - 1);
 763 
 764         /*
 765          * Look for a large enough hole starting below userlimit.
 766          * After finding it, use the upper part.
 767          */
 768         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 769             PAGESIZE, off) == 0) {
 770                 caddr_t as_addr;
 771 
 772                 /*
 773                  * addr is the highest possible address to use since we have
 774                  * a PAGESIZE redzone at the beginning and end.
 775                  */
 776                 addr = base + slen - (PAGESIZE + len);
 777                 as_addr = addr;
 778                 /*
 779                  * Round address DOWN to the alignment amount and
 780                  * add the offset in.
 781                  * If addr is greater than as_addr, len would not be large
 782                  * enough to include the redzone, so we must adjust down
 783                  * by the alignment amount.
 784                  */
 785                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 786                 addr += (uintptr_t)off;
 787                 if (addr > as_addr) {
 788                         addr -= align_amount;
 789                 }
 790 
 791                 /*
 792                  * If randomization is requested, slew the allocation
 793                  * backwards, within the same gap, by a random amount.
 794                  *
 795                  * XXX: This will fall over in processes like Java, which
 796                  * commonly have a great many small mappings.
 797                  */
 798                 if (flags & _MAP_RANDOMIZE) {
 799                         uint32_t slew;
 800 
 801                         (void) random_get_pseudo_bytes((uint8_t *)&slew,
 802                             sizeof (slew));
 803 
 804                         slew = slew % MIN(aslr_max_map_skew, (addr - base));
 805                         addr -= P2ALIGN(slew, align_amount);
 806                 }
 807 
 808                 ASSERT(addr > base);
 809                 ASSERT(addr + len < base + slen);
 810                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 811                     ((uintptr_t)(off)));
 812                 *addrp = addr;
 813         } else {
 814                 *addrp = NULL;  /* no more virtual space */
 815         }
 816 }
 817 
 818 int valid_va_range_aligned_wraparound;
 819 
 820 /*
 821  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 822  * addresses at least "minlen" long, where the base of the range is at "off"
 823  * phase from an "align" boundary and there is space for a "redzone"-sized
 824  * redzone on either side of the range.  On success, 1 is returned and *basep
 825  * and *lenp are adjusted to describe the acceptable range (including
 826  * the redzone).  On failure, 0 is returned.
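      *
      * For example: minlen == 8K, redzone == 4K, align == 64K and off == 0
      * require the range to be at least 16K long and to contain a
      * 64K-aligned address with at least 4K of space below it and 8K + 4K
      * of space above it.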
 827  */
 828 /*ARGSUSED3*/
 829 int
 830 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 831     size_t align, size_t redzone, size_t off)
 832 {
 833         uintptr_t hi, lo;
 834         size_t tot_len;
 835 
 836         ASSERT(align == 0 ? off == 0 : off < align);
 837         ASSERT(ISP2(align));
 838         ASSERT(align == 0 || align >= PAGESIZE);
 839 
 840         lo = (uintptr_t)*basep;
 841         hi = lo + *lenp;
 842         tot_len = minlen + 2 * redzone; /* need at least this much space */
 843 
 844         /*
 845          * If hi rolled over the top, try cutting back.
 846          */
 847         if (hi < lo) {
 848                 *lenp = 0UL - lo - 1UL;
 849                 /* See if this really happens. If so, then we figure out why */
 850                 valid_va_range_aligned_wraparound++;
 851                 hi = lo + *lenp;
 852         }
 853         if (*lenp < tot_len) {
 854                 return (0);
 855         }
 856 
 857 #if defined(__amd64)
 858         /*
 859          * Deal with a possible hole in the address range between
 860          * hole_start and hole_end that should never be mapped.
 861          */
 862         if (lo < hole_start) {
 863                 if (hi > hole_start) {
 864                         if (hi < hole_end) {
 865                                 hi = hole_start;
 866                         } else {
 867                                 /* lo < hole_start && hi >= hole_end */
 868                                 if (dir == AH_LO) {
 869                                         /*
 870                                          * prefer lowest range
 871                                          */
 872                                         if (hole_start - lo >= tot_len)
 873                                                 hi = hole_start;
 874                                         else if (hi - hole_end >= tot_len)
 875                                                 lo = hole_end;
 876                                         else
 877                                                 return (0);
 878                                 } else {
 879                                         /*
 880                                          * prefer highest range
 881                                          */
 882                                         if (hi - hole_end >= tot_len)
 883                                                 lo = hole_end;
 884                                         else if (hole_start - lo >= tot_len)
 885                                                 hi = hole_start;
 886                                         else
 887                                                 return (0);
 888                                 }
 889                         }
 890                 }
 891         } else {
 892                 /* lo >= hole_start */
 893                 if (hi < hole_end)
 894                         return (0);
 895                 if (lo < hole_end)
 896                         lo = hole_end;
 897         }
 898 #endif
 899 
 900         if (hi - lo < tot_len)
 901                 return (0);
 902 
 903         if (align > 1) {
 904                 uintptr_t tlo = lo + redzone;
 905                 uintptr_t thi = hi - redzone;
 906                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 907                 if (tlo < lo + redzone) {
 908                         return (0);
 909                 }
 910                 if (thi < tlo || thi - tlo < minlen) {
 911                         return (0);
 912                 }
 913         }
 914 
 915         *basep = (caddr_t)lo;
 916         *lenp = hi - lo;
 917         return (1);
 918 }
 919 
 920 /*
 921  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 922  * addresses at least "minlen" long.  On success, 1 is returned and *basep
 923  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 924  * is returned.
 925  */
 926 int
 927 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
 928 {
 929         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
 930 }
 931 
 932 /*
 933  * Determine whether [addr, addr+len] are valid user addresses.
 934  */
 935 /*ARGSUSED*/
 936 int
 937 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
 938     caddr_t userlimit)
 939 {
 940         caddr_t eaddr = addr + len;
 941 
 942         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
 943                 return (RANGE_BADADDR);
 944 
 945 #if defined(__amd64)
 946         /*
 947          * Check for the VA hole
 948          */
 949         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
 950                 return (RANGE_BADADDR);
 951 #endif
 952 
 953         return (RANGE_OKAY);
 954 }
 955 
 956 /*
 957  * Return 1 if the page frame is onboard memory, else 0.
 958  */
 959 int
 960 pf_is_memory(pfn_t pf)
 961 {
 962         if (pfn_is_foreign(pf))
 963                 return (0);
 964         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
 965 }
 966 
 967 /*
 968  * return the memrange containing pfn
 969  */
 970 int
 971 memrange_num(pfn_t pfn)
 972 {
 973         int n;
 974 
 975         for (n = 0; n < nranges - 1; ++n) {
 976                 if (pfn >= memranges[n])
 977                         break;
 978         }
 979         return (n);
 980 }
 981 
 982 /*
 983  * return the mnoderange (mtype) index containing pfn
 984  */
 985 /*ARGSUSED*/
 986 int
 987 pfn_2_mtype(pfn_t pfn)
 988 {
 989 #if defined(__xpv)
 990         return (0);
 991 #else
 992         int     n;
 993 
 994         /* Always start from highest pfn and work our way down */
 995         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
 996                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
 997                         break;
 998                 }
 999         }
1000         return (n);
1001 #endif
1002 }
1003 
1004 #if !defined(__xpv)
1005 /*
1006  * is_contigpage_free:
1007  *      returns a page list of contiguous pages. It minimally has to return
1008  *      minctg pages. Caller determines minctg based on the scatter-gather
1009  *      list length.
1010  *
1011  *      pfnp is set to the next page frame to search on return.
1012  */
1013 static page_t *
1014 is_contigpage_free(
1015         pfn_t *pfnp,
1016         pgcnt_t *pgcnt,
1017         pgcnt_t minctg,
1018         uint64_t pfnseg,
1019         int iolock)
1020 {
1021         int     i = 0;
1022         pfn_t   pfn = *pfnp;
1023         page_t  *pp;
1024         page_t  *plist = NULL;
1025 
1026         /*
1027          * fail if pfn + minctg crosses a segment boundary.
1028          * Adjust for next starting pfn to begin at segment boundary.
1029          */
1030 
1031         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1032                 *pfnp = roundup(*pfnp, pfnseg + 1);
1033                 return (NULL);
1034         }
1035 
1036         do {
1037 retry:
1038                 pp = page_numtopp_nolock(pfn + i);
1039                 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1040                     (page_trylock(pp, SE_EXCL) == 0)) {
1041                         (*pfnp)++;
1042                         break;
1043                 }
1044                 if (page_pptonum(pp) != pfn + i) {
1045                         page_unlock(pp);
1046                         goto retry;
1047                 }
1048 
1049                 if (!(PP_ISFREE(pp))) {
1050                         page_unlock(pp);
1051                         (*pfnp)++;
1052                         break;
1053                 }
1054 
1055                 if (!PP_ISAGED(pp)) {
1056                         page_list_sub(pp, PG_CACHE_LIST);
1057                         page_hashout(pp, (kmutex_t *)NULL);
1058                 } else {
1059                         page_list_sub(pp, PG_FREE_LIST);
1060                 }
1061 
1062                 if (iolock)
1063                         page_io_lock(pp);
1064                 page_list_concat(&plist, &pp);
1065 
1066                 /*
1067                  * exit loop when pgcnt satisfied or segment boundary reached.
1068                  */
1069 
1070         } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1071 
1072         *pfnp += i;             /* set to next pfn to search */
1073 
1074         if (i >= minctg) {
1075                 *pgcnt -= i;
1076                 return (plist);
1077         }
1078 
1079         /*
1080          * failure: minctg not satisfied.
1081          *
1082          * if next request crosses segment boundary, set next pfn
1083          * to search from the segment boundary.
1084          */
1085         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1086                 *pfnp = roundup(*pfnp, pfnseg + 1);
1087 
1088         /* clean up any pages already allocated */
1089 
1090         while (plist) {
1091                 pp = plist;
1092                 page_sub(&plist, pp);
1093                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1094                 if (iolock)
1095                         page_io_unlock(pp);
1096                 page_unlock(pp);
1097         }
1098 
1099         return (NULL);
1100 }
1101 #endif  /* !__xpv */
1102 
1103 /*
1104  * verify that pages being returned from the allocator have the correct DMA attributes
1105  */
1106 #ifndef DEBUG
1107 #define check_dma(a, b, c) (void)(0)
1108 #else
1109 static void
1110 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1111 {
1112         if (dma_attr == NULL)
1113                 return;
1114 
1115         while (cnt-- > 0) {
1116                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1117                     dma_attr->dma_attr_addr_lo)
1118                         panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1119                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1120                     dma_attr->dma_attr_addr_hi)
1121                         panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1122                 pp = pp->p_next;
1123         }
1124 }
1125 #endif
1126 
1127 #if !defined(__xpv)
1128 static page_t *
1129 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1130 {
1131         pfn_t           pfn;
1132         int             sgllen;
1133         uint64_t        pfnseg;
1134         pgcnt_t         minctg;
1135         page_t          *pplist = NULL, *plist;
1136         uint64_t        lo, hi;
1137         pgcnt_t         pfnalign = 0;
1138         static pfn_t    startpfn;
1139         static pgcnt_t  lastctgcnt;
1140         uintptr_t       align;
1141 
1142         CONTIG_LOCK();
1143 
1144         if (mattr) {
1145                 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1146                 hi = mmu_btop(mattr->dma_attr_addr_hi);
1147                 if (hi >= physmax)
1148                         hi = physmax - 1;
1149                 sgllen = mattr->dma_attr_sgllen;
1150                 pfnseg = mmu_btop(mattr->dma_attr_seg);
1151 
1152                 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1153                 if (align > MMU_PAGESIZE)
1154                         pfnalign = mmu_btop(align);
1155 
1156                 /*
1157                  * in order to satisfy the request, we must acquire at
1158                  * least minctg contiguous pages
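                       * (e.g. *pgcnt == 10 with sgllen == 4 gives minctg ==
                       * howmany(10, 4) == 3)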
1159                  */
1160                 minctg = howmany(*pgcnt, sgllen);
1161 
1162                 ASSERT(hi >= lo);
1163 
1164                 /*
1165                  * start from where we last searched if minctg >= lastctgcnt
1166                  */
1167                 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1168                         startpfn = lo;
1169         } else {
1170                 hi = physmax - 1;
1171                 lo = 0;
1172                 sgllen = 1;
1173                 pfnseg = mmu.highest_pfn;
1174                 minctg = *pgcnt;
1175 
1176                 if (minctg < lastctgcnt)
1177                         startpfn = lo;
1178         }
1179         lastctgcnt = minctg;
1180 
1181         ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1182 
1183         /* conserve 16m memory - start search above 16m when possible */
1184         if (hi > PFN_16M && startpfn < PFN_16M)
1185                 startpfn = PFN_16M;
1186 
1187         pfn = startpfn;
1188         if (pfnalign)
1189                 pfn = P2ROUNDUP(pfn, pfnalign);
1190 
1191         while (pfn + minctg - 1 <= hi) {
1192 
1193                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1194                 if (plist) {
1195                         page_list_concat(&pplist, &plist);
1196                         sgllen--;
1197                         /*
1198                          * return when contig pages no longer needed
1199                          */
1200                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1201                                 startpfn = pfn;
1202                                 CONTIG_UNLOCK();
1203                                 check_dma(mattr, pplist, *pgcnt);
1204                                 return (pplist);
1205                         }
1206                         minctg = howmany(*pgcnt, sgllen);
1207                 }
1208                 if (pfnalign)
1209                         pfn = P2ROUNDUP(pfn, pfnalign);
1210         }
1211 
1212         /* cannot find contig pages in specified range */
1213         if (startpfn == lo) {
1214                 CONTIG_UNLOCK();
1215                 return (NULL);
1216         }
1217 
1218         /* did not start with lo previously */
1219         pfn = lo;
1220         if (pfnalign)
1221                 pfn = P2ROUNDUP(pfn, pfnalign);
1222 
1223         /* allow search to go above startpfn */
1224         while (pfn < startpfn) {
1225 
1226                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1227                 if (plist != NULL) {
1228 
1229                         page_list_concat(&pplist, &plist);
1230                         sgllen--;
1231 
1232                         /*
1233                          * return when contig pages no longer needed
1234                          */
1235                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1236                                 startpfn = pfn;
1237                                 CONTIG_UNLOCK();
1238                                 check_dma(mattr, pplist, *pgcnt);
1239                                 return (pplist);
1240                         }
1241                         minctg = howmany(*pgcnt, sgllen);
1242                 }
1243                 if (pfnalign)
1244                         pfn = P2ROUNDUP(pfn, pfnalign);
1245         }
1246         CONTIG_UNLOCK();
1247         return (NULL);
1248 }
1249 #endif  /* !__xpv */
1250 
1251 /*
1252  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1253  * memranges[]. Used to determine the size of page lists and mnoderanges.
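      *
      * Illustrative (default arch_memranges table): an mnode spanning pfns 0
      * through 0x17ffff (0-6GB) intersects the 0-16M, 16M-2G, 2G-4G and 4G+
      * ranges, so the count is 4.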
1254  */
1255 int
1256 mnode_range_cnt(int mnode)
1257 {
1258 #if defined(__xpv)
1259         ASSERT(mnode == 0);
1260         return (1);
1261 #else   /* __xpv */
1262         int     mri;
1263         int     mnrcnt = 0;
1264 
1265         if (mem_node_config[mnode].exists != 0) {
1266                 mri = nranges - 1;
1267 
1268                 /* find the memranges index containing the mnode's lowest pfn */
1269 
1270                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1271                         mri--;
1272 
1273                 /*
1274                  * increment mnode range counter when memranges or mnode
1275                  * boundary is reached.
1276                  */
1277                 while (mri >= 0 &&
1278                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1279                         mnrcnt++;
1280                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1281                                 mri--;
1282                         else
1283                                 break;
1284                 }
1285         }
1286         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1287         return (mnrcnt);
1288 #endif  /* __xpv */
1289 }
1290 
1291 /*
1292  * mnode_range_setup() initializes mnoderanges.
1293  */
1294 void
1295 mnode_range_setup(mnoderange_t *mnoderanges)
1296 {
1297         mnoderange_t *mp = mnoderanges;
1298         int     mnode, mri;
1299         int     mindex = 0;     /* current index into mnoderanges array */
1300         int     i, j;
1301         pfn_t   hipfn;
1302         int     last, hi;
1303 
1304         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1305                 if (mem_node_config[mnode].exists == 0)
1306                         continue;
1307 
1308                 mri = nranges - 1;
1309 
1310                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1311                         mri--;
1312 
1313                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1314                     MEMRANGELO(mri)) {
1315                         mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1316                             mem_node_config[mnode].physbase);
1317                         mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1318                             mem_node_config[mnode].physmax);
1319                         mnoderanges->mnr_mnode = mnode;
1320                         mnoderanges->mnr_memrange = mri;
1321                         mnoderanges->mnr_exists = 1;
1322                         mnoderanges++;
1323                         mindex++;
1324                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1325                                 mri--;
1326                         else
1327                                 break;
1328                 }
1329         }
1330 
1331         /*
1332          * For now do a simple sort of the mnoderanges array to fill in
1333          * the mnr_next fields.  Since mindex is expected to be relatively
1334          * small, using a simple O(N^2) algorithm.
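              *
              * The resulting mnr_next links run from mtypetop (the highest
              * pfn range) down to the range starting at pfn 0 (mtype16m),
              * which terminates the list with mnr_next == -1.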
1335          */
1336         for (i = 0; i < mindex; i++) {
1337                 if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1338                         break;
1339         }
1340         ASSERT(i < mindex);
1341         last = i;
1342         mtype16m = last;
1343         mp[last].mnr_next = -1;
1344         for (i = 0; i < mindex - 1; i++) {
1345                 hipfn = (pfn_t)(-1);
1346                 hi = -1;
1347                 /* find next highest mnode range */
1348                 for (j = 0; j < mindex; j++) {
1349                         if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1350                             mp[j].mnr_pfnlo < hipfn) {
1351                                 hipfn = mp[j].mnr_pfnlo;
1352                                 hi = j;
1353                         }
1354                 }
1355                 mp[hi].mnr_next = last;
1356                 last = hi;
1357         }
1358         mtypetop = last;
1359 }
1360 
1361 #ifndef __xpv
1362 /*
1363  * Update mnoderanges for memory hot-add DR operations.
1364  */
1365 static void
1366 mnode_range_add(int mnode)
1367 {
1368         int     *prev;
1369         int     n, mri;
1370         pfn_t   start, end;
1371         extern  void membar_sync(void);
1372 
1373         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1374         ASSERT(mem_node_config[mnode].exists);
1375         start = mem_node_config[mnode].physbase;
1376         end = mem_node_config[mnode].physmax;
1377         ASSERT(start <= end);
1378         mutex_enter(&mnoderange_lock);
1379 
1380 #ifdef  DEBUG
1381         /* Check whether it interleaves with other memory nodes. */
1382         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1383                 ASSERT(mnoderanges[n].mnr_exists);
1384                 if (mnoderanges[n].mnr_mnode == mnode)
1385                         continue;
1386                 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1387                     end < mnoderanges[n].mnr_pfnlo);
1388         }
1389 #endif  /* DEBUG */
1390 
1391         mri = nranges - 1;
1392         while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1393                 mri--;
1394         while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1395                 /* Check whether mtype already exists. */
1396                 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1397                         if (mnoderanges[n].mnr_mnode == mnode &&
1398                             mnoderanges[n].mnr_memrange == mri) {
1399                                 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1400                                     start);
1401                                 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1402                                     end);
1403                                 break;
1404                         }
1405                 }
1406 
1407                 /* Add a new entry if it doesn't exist yet. */
1408                 if (n == -1) {
1409                         /* Try to find an unused entry in mnoderanges array. */
1410                         for (n = 0; n < mnoderangecnt; n++) {
1411                                 if (mnoderanges[n].mnr_exists == 0)
1412                                         break;
1413                         }
1414                         ASSERT(n < mnoderangecnt);
1415                         mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1416                         mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1417                         mnoderanges[n].mnr_mnode = mnode;
1418                         mnoderanges[n].mnr_memrange = mri;
1419                         mnoderanges[n].mnr_exists = 1;
1420                         /* Page 0 should always be present. */
1421                         for (prev = &mtypetop;
1422                             mnoderanges[*prev].mnr_pfnlo > start;
1423                             prev = &mnoderanges[*prev].mnr_next) {
1424                                 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1425                                 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1426                         }
1427                         mnoderanges[n].mnr_next = *prev;
1428                         membar_sync();
1429                         *prev = n;
1430                 }
1431 
1432                 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1433                         mri--;
1434                 else
1435                         break;
1436         }
1437 
1438         mutex_exit(&mnoderange_lock);
1439 }
1440 
1441 /*
1442  * Update mnoderanges for memory hot-removal DR operations.
1443  */
1444 static void
1445 mnode_range_del(int mnode)
1446 {
1447         _NOTE(ARGUNUSED(mnode));
1448         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1449         /* TODO: support deletion operation. */
1450         ASSERT(0);
1451 }
1452 
1453 void
1454 plat_slice_add(pfn_t start, pfn_t end)
1455 {
1456         mem_node_add_slice(start, end);
1457         if (plat_dr_enabled()) {
1458                 mnode_range_add(PFN_2_MEM_NODE(start));
1459         }
1460 }
1461 
1462 void
1463 plat_slice_del(pfn_t start, pfn_t end)
1464 {
1465         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1466         ASSERT(plat_dr_enabled());
1467         mnode_range_del(PFN_2_MEM_NODE(start));
1468         mem_node_del_slice(start, end);
1469 }
1470 #endif  /* __xpv */
1471 
1472 /*ARGSUSED*/
1473 int
1474 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1475 {
1476         int mtype = mtypetop;
1477 
1478 #if !defined(__xpv)
1479 #if defined(__i386)
1480         /*
1481          * set the mtype range
1482          * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1483          * - for non kmem requests, set range to above 4g if memory below 4g
1484          * runs low.
1485          */
1486         if (restricted_kmemalloc && VN_ISKAS(vp) &&
1487             (caddr_t)(vaddr) >= kernelheap &&
1488             (caddr_t)(vaddr) < ekernelheap) {
1489                 ASSERT(physmax4g);
1490                 mtype = mtype4g;
1491                 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1492                     btop(pgsz), *flags)) {
1493                         *flags |= PGI_MT_RANGE16M;
1494                 } else {
1495                         VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1496                         VM_STAT_COND_ADD((*flags & PG_PANIC),
1497                             vmm_vmstats.pgpanicalloc);
1498                         *flags |= PGI_MT_RANGE0;
1499                 }
1500                 return (mtype);
1501         }
1502 #endif  /* __i386 */
1503 
1504         if (RESTRICT4G_ALLOC) {
1505                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1506                 /* here only for > 4g systems */
1507                 *flags |= PGI_MT_RANGE4G;
1508         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1509                 *flags |= PGI_MT_RANGE16M;
1510         } else {
1511                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1512                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1513                 *flags |= PGI_MT_RANGE0;
1514         }
1515 #endif /* !__xpv */
1516         return (mtype);
1517 }
1518 
1519 
1520 /* mtype init for page_get_replacement_page */
1521 /*ARGSUSED*/
1522 int
1523 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1524 {
1525         int mtype = mtypetop;
1526 #if !defined(__xpv)
1527         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1528                 *flags |= PGI_MT_RANGE16M;
1529         } else {
1530                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1531                 *flags |= PGI_MT_RANGE0;
1532         }
1533 #endif
1534         return (mtype);
1535 }
1536 
1537 /*
1538  * Determine if the mnode range specified in mtype contains memory belonging
1539  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1540  * the range from high pfn to 0, 16m or 4g.
1541  *
1542  * Return the first mnode range type index found, or -1 if none is found.
1543  */
1544 int
1545 mtype_func(int mnode, int mtype, uint_t flags)
1546 {
1547         if (flags & PGI_MT_RANGE) {
1548                 int     mnr_lim = MRI_0;
1549 
1550                 if (flags & PGI_MT_NEXT) {
1551                         mtype = mnoderanges[mtype].mnr_next;
1552                 }
1553                 if (flags & PGI_MT_RANGE4G)
1554                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1555                 else if (flags & PGI_MT_RANGE16M)
1556                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1557                 while (mtype != -1 &&
1558                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1559                         if (mnoderanges[mtype].mnr_mnode == mnode)
1560                                 return (mtype);
1561                         mtype = mnoderanges[mtype].mnr_next;
1562                 }
1563         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1564                 return (mtype);
1565         }
1566         return (-1);
1567 }
1568 
1569 /*
1570  * Update the page list max counts with the pfn range specified by the
1571  * input parameters.
1572  */
1573 void
1574 mtype_modify_max(pfn_t startpfn, long cnt)
1575 {
1576         int             mtype;
1577         pgcnt_t         inc;
1578         spgcnt_t        scnt = (spgcnt_t)(cnt);
1579         pgcnt_t         acnt = ABS(scnt);
1580         pfn_t           endpfn = startpfn + acnt;
1581         pfn_t           pfn, lo;
1582 
1583         if (!physmax4g)
1584                 return;
1585 
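             /*
              * Walk the mnoderange list from mtypetop (highest pfns) down,
              * adjusting maxmem4g by the portion of [startpfn, endpfn) that
              * falls in ranges below 4g.
              */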
1586         mtype = mtypetop;
1587         for (pfn = endpfn; pfn > startpfn; ) {
1588                 ASSERT(mtype != -1);
1589                 lo = mnoderanges[mtype].mnr_pfnlo;
1590                 if (pfn > lo) {
1591                         if (startpfn >= lo) {
1592                                 inc = pfn - startpfn;
1593                         } else {
1594                                 inc = pfn - lo;
1595                         }
1596                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1597                                 if (scnt > 0)
1598                                         maxmem4g += inc;
1599                                 else
1600                                         maxmem4g -= inc;
1601                         }
1602                         pfn -= inc;
1603                 }
1604                 mtype = mnoderanges[mtype].mnr_next;
1605         }
1606 }
1607 
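     /*
      * Return the memory range index (MRI_*) for the given mtype.
      */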
1608 int
1609 mtype_2_mrange(int mtype)
1610 {
1611         return (mnoderanges[mtype].mnr_memrange);
1612 }
1613 
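     /*
      * Return the pfn range [*pfnlo, *pfnhi] covered by mtype, which must
      * belong to memory node mnode.
      */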
1614 void
1615 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1616 {
1617         _NOTE(ARGUNUSED(mnode));
1618         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1619         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1620         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1621 }
1622 
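     /*
      * Add the space needed for the DEBUG per-mtype, per-page-size color
      * counters to ctrs_sz and return the result.
      */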
1623 size_t
1624 plcnt_sz(size_t ctrs_sz)
1625 {
1626 #ifdef DEBUG
1627         int     szc, colors;
1628 
1629         ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1630         for (szc = 0; szc < mmu_page_sizes; szc++) {
1631                 colors = page_get_pagecolors(szc);
1632                 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1633         }
1634 #endif
1635         return (ctrs_sz);
1636 }
1637 
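     /*
      * Carve the DEBUG per-mtype counters out of the buffer at addr and
      * return the first unused address.
      */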
1638 caddr_t
1639 plcnt_init(caddr_t addr)
1640 {
1641 #ifdef DEBUG
1642         int     mt, szc, colors;
1643 
1644         for (mt = 0; mt < mnoderangecnt; mt++) {
1645                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1646                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1647                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1648                         colors = page_get_pagecolors(szc);
1649                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1650                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1651                             (pgcnt_t *)addr;
1652                         addr += (sizeof (pgcnt_t) * colors);
1653                 }
1654         }
1655 #endif
1656         return (addr);
1657 }
1658 
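     /*
      * Adjust the page list counts for mtype by cnt pages of size szc,
      * updating freemem4g and the cache/free list totals as well.
      */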
1659 void
1660 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1661 {
1662         _NOTE(ARGUNUSED(pp));
1663 #ifdef DEBUG
1664         int     bin = PP_2_BIN(pp);
1665 
1666         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1667         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1668             cnt);
1669 #endif
1670         ASSERT(mtype == PP_2_MTYPE(pp));
1671         if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1672                 atomic_add_long(&freemem4g, cnt);
1673         if (flags & PG_CACHE_LIST)
1674                 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1675         else
1676                 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1677         atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1678 }
1679 
1680 /*
1681  * Returns the free page count for mnode
1682  */
1683 int
1684 mnode_pgcnt(int mnode)
1685 {
1686         int     mtype = mtypetop;
1687         int     flags = PGI_MT_RANGE0;
1688         pgcnt_t pgcnt = 0;
1689 
1690         mtype = mtype_func(mnode, mtype, flags);
1691 
1692         while (mtype != -1) {
1693                 pgcnt += MTYPE_FREEMEM(mtype);
1694                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1695         }
1696         return (pgcnt);
1697 }
1698 
1699 /*
1700  * Initialize page coloring variables based on the l2 cache parameters.
1701  * Calculate and return memory needed for page coloring data structures.
1702  */
1703 size_t
1704 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1705 {
1706         _NOTE(ARGUNUSED(l2_linesz));
1707         size_t  colorsz = 0;
1708         int     i;
1709         int     colors;
1710 
1711 #if defined(__xpv)
1712         /*
1713          * Hypervisor domains currently don't have any concept of NUMA.
1714          * Hence we'll act like there is only 1 memrange.
1715          */
1716         i = memrange_num(1);
1717 #else /* !__xpv */
1718         /*
1719          * Reduce the memory range lists if we don't have large amounts
1720          * of memory. This avoids searching known empty free lists.
1721          * To support memory DR operations, we need to keep memory ranges
1722          * for possible memory hot-add operations.
1723          */
1724         if (plat_dr_physmax > physmax)
1725                 i = memrange_num(plat_dr_physmax);
1726         else
1727                 i = memrange_num(physmax);
1728 #if defined(__i386)
1729         if (i > MRI_4G)
1730                 restricted_kmemalloc = 0;
1731 #endif
1732         /* physmax greater than 4g */
1733         if (i == MRI_4G)
1734                 physmax4g = 1;
1735 #endif /* !__xpv */
1736         memranges += i;
1737         nranges -= i;
1738 
1739         ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1740 
1741         ASSERT(ISP2(l2_linesz));
1742         ASSERT(l2_sz > MMU_PAGESIZE);
1743 
1744         /* l2_assoc is 0 for fully associative l2 cache */
1745         if (l2_assoc)
1746                 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1747         else
1748                 l2_colors = 1;
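             /*
              * For example (illustrative numbers only): a 2MB, 8-way L2
              * with 4KB pages gives 2MB / (8 * 4KB) = 64 colors.
              */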
1749 
1750         ASSERT(ISP2(l2_colors));
1751 
1752         /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1753         page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1754 
1755         /*
1756          * cpu_page_colors is non-zero when a page color may be spread across
1757          * multiple bins.
1758          */
1759         if (l2_colors < page_colors)
1760                 cpu_page_colors = l2_colors;
1761 
1762         ASSERT(ISP2(page_colors));
1763 
1764         page_colors_mask = page_colors - 1;
1765 
1766         ASSERT(ISP2(CPUSETSIZE()));
1767         page_coloring_shift = lowbit(CPUSETSIZE());
1768 
1769         /* initialize number of colors per page size */
1770         for (i = 0; i <= mmu.max_page_level; i++) {
1771                 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1772                 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1773                 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1774                 hw_page_array[i].hp_colors = (page_colors_mask >>
1775                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1776                     + 1;
1777                 colorequivszc[i] = 0;
1778         }
1779 
1780         /*
1781          * The value of cpu_page_colors determines if additional color bins
1782          * need to be checked for a particular color in the page_get routines.
1783          */
1784         if (cpu_page_colors != 0) {
1785 
1786                 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1787                 ASSERT(a > 0);
1788                 ASSERT(a < 16);
1789 
1790                 for (i = 0; i <= mmu.max_page_level; i++) {
1791                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1792                                 colorequivszc[i] = 0;
1793                                 continue;
1794                         }
1795                         while ((colors >> a) == 0)
1796                                 a--;
1797                         ASSERT(a >= 0);
1798 
1799                         /* higher 4 bits encodes color equiv mask */
1800                         colorequivszc[i] = (a << 4);
1801                 }
1802         }
1803 
1804         /* factor in colorequiv to check additional 'equivalent' bins. */
1805         if (colorequiv > 1) {
1806 
1807                 int a = lowbit(colorequiv) - 1;
1808                 if (a > 15)
1809                         a = 15;
1810 
1811                 for (i = 0; i <= mmu.max_page_level; i++) {
1812                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1813                                 continue;
1814                         }
1815                         while ((colors >> a) == 0)
1816                                 a--;
1817                         if ((a << 4) > colorequivszc[i]) {
1818                                 colorequivszc[i] = (a << 4);
1819                         }
1820                 }
1821         }
1822 
1823         /* size for mnoderanges */
1824         for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1825                 mnoderangecnt += mnode_range_cnt(i);
1826         if (plat_dr_support_memory()) {
1827                 /*
1828                  * Reserve enough space for memory DR operations.
1829                  * Two extra mnoderanges for possible fragmentation,
1830                  * one for the 2G boundary and the other for the 4G boundary.
1831                  * We don't expect a memory board crossing the 16M boundary
1832                  * for memory hot-add operations on x86 platforms.
1833                  */
1834                 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1835         }
1836         colorsz = mnoderangecnt * sizeof (mnoderange_t);
1837 
1838         /* size for fpc_mutex and cpc_mutex */
1839         colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1840 
1841         /* size of page_freelists */
1842         colorsz += mnoderangecnt * sizeof (page_t ***);
1843         colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1844 
1845         for (i = 0; i < mmu_page_sizes; i++) {
1846                 colors = page_get_pagecolors(i);
1847                 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1848         }
1849 
1850         /* size of page_cachelists */
1851         colorsz += mnoderangecnt * sizeof (page_t **);
1852         colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1853 
1854         return (colorsz);
1855 }
1856 
1857 /*
1858  * Called once at startup to configure the page_coloring data structures
1859  * and do the first page_free()/page_freelist_add().
1860  */
1861 void
1862 page_coloring_setup(caddr_t pcmemaddr)
1863 {
1864         int     i;
1865         int     j;
1866         int     k;
1867         caddr_t addr;
1868         int     colors;
1869 
1870         /*
1871          * do page coloring setup
1872          */
1873         addr = pcmemaddr;
1874 
1875         mnoderanges = (mnoderange_t *)addr;
1876         addr += (mnoderangecnt * sizeof (mnoderange_t));
1877 
1878         mnode_range_setup(mnoderanges);
1879 
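             /*
              * pfn 0xfffff is the highest pfn below 4g (with 4k pages), so
              * mtype4g records the mtype of the top sub-4g range.
              */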
1880         if (physmax4g)
1881                 mtype4g = pfn_2_mtype(0xfffff);
1882 
1883         for (k = 0; k < NPC_MUTEX; k++) {
1884                 fpc_mutex[k] = (kmutex_t *)addr;
1885                 addr += (max_mem_nodes * sizeof (kmutex_t));
1886         }
1887         for (k = 0; k < NPC_MUTEX; k++) {
1888                 cpc_mutex[k] = (kmutex_t *)addr;
1889                 addr += (max_mem_nodes * sizeof (kmutex_t));
1890         }
1891         page_freelists = (page_t ****)addr;
1892         addr += (mnoderangecnt * sizeof (page_t ***));
1893 
1894         page_cachelists = (page_t ***)addr;
1895         addr += (mnoderangecnt * sizeof (page_t **));
1896 
1897         for (i = 0; i < mnoderangecnt; i++) {
1898                 page_freelists[i] = (page_t ***)addr;
1899                 addr += (mmu_page_sizes * sizeof (page_t **));
1900 
1901                 for (j = 0; j < mmu_page_sizes; j++) {
1902                         colors = page_get_pagecolors(j);
1903                         page_freelists[i][j] = (page_t **)addr;
1904                         addr += (colors * sizeof (page_t *));
1905                 }
1906                 page_cachelists[i] = (page_t **)addr;
1907                 addr += (page_colors * sizeof (page_t *));
1908         }
1909 }
1910 
1911 #if defined(__xpv)
1912 /*
1913  * Give back 10% of the io_pool pages to the free list.
1914  * Don't shrink the pool below some absolute minimum.
1915  */
1916 static void
1917 page_io_pool_shrink()
1918 {
1919         int retcnt;
1920         page_t *pp, *pp_first, *pp_last, **curpool;
1921         mfn_t mfn;
1922         int bothpools = 0;
1923 
1924         mutex_enter(&io_pool_lock);
1925         io_pool_shrink_attempts++;      /* should be a kstat? */
1926         retcnt = io_pool_cnt / 10;
1927         if (io_pool_cnt - retcnt < io_pool_cnt_min)
1928                 retcnt = io_pool_cnt - io_pool_cnt_min;
1929         if (retcnt <= 0)
1930                 goto done;
1931         io_pool_shrinks++;      /* should be a kstat? */
1932         curpool = &io_pool_4g;
1933 domore:
1934         /*
1935          * Loop through, taking pages from the end of the list
1936          * (highest mfns), until the amount to return is reached.
1937          */
1938         for (pp = *curpool; pp && retcnt > 0; ) {
1939                 pp_first = pp_last = pp->p_prev;
1940                 if (pp_first == *curpool)
1941                         break;
1942                 retcnt--;
1943                 io_pool_cnt--;
1944                 page_io_pool_sub(curpool, pp_first, pp_last);
1945                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
1946                         start_mfn = mfn;
1947                 page_free(pp_first, 1);
1948                 pp = *curpool;
1949         }
1950         if (retcnt != 0 && !bothpools) {
1951                 /*
1952                  * If not enough were found in the less constrained pool,
1953                  * try the more constrained one.
1954                  */
1955                 curpool = &io_pool_16m;
1956                 bothpools = 1;
1957                 goto domore;
1958         }
1959 done:
1960         mutex_exit(&io_pool_lock);
1961 }
1962 
1963 #endif  /* __xpv */
1964 
1965 uint_t
1966 page_create_update_flags_x86(uint_t flags)
1967 {
1968 #if defined(__xpv)
1969         /*
1970          * If this is an urgent allocation and free pages are depleted,
              * shrink the io_pool to give pages back to the free list.
1971          */
1972         if (!(flags & PG_WAIT) && freemem < desfree)
1973                 page_io_pool_shrink();
1974 #else /* !__xpv */
1975         /*
1976          * page_create_get_something may call this because 4g memory may be
1977          * depleted. Set flags to allow for relocation of base page below
1978          * 4g if necessary.
1979          */
1980         if (physmax4g)
1981                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1982 #endif /* __xpv */
1983         return (flags);
1984 }
1985 
1986 /*ARGSUSED*/
1987 int
1988 bp_color(struct buf *bp)
1989 {
1990         return (0);
1991 }
1992 
1993 #if defined(__xpv)
1994 
1995 /*
1996  * Take pages out of an io_pool
1997  */
1998 static void
1999 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2000 {
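             /*
              * The pool is a circular doubly linked list.  If the head of
              * the pool is being removed, advance it; if the removal
              * empties the list, clear the pool pointer.
              */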
2001         if (*poolp == pp_first) {
2002                 *poolp = pp_last->p_next;
2003                 if (*poolp == pp_first)
2004                         *poolp = NULL;
2005         }
2006         pp_first->p_prev->p_next = pp_last->p_next;
2007         pp_last->p_next->p_prev = pp_first->p_prev;
2008         pp_first->p_prev = pp_last;
2009         pp_last->p_next = pp_first;
2010 }
2011 
2012 /*
2013  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2014  */
2015 static void
2016 page_io_pool_add(page_t **poolp, page_t *pp)
2017 {
2018         page_t  *look;
2019         mfn_t   mfn = mfn_list[pp->p_pagenum];
2020 
2021         if (*poolp == NULL) {
2022                 *poolp = pp;
2023                 pp->p_next = pp;
2024                 pp->p_prev = pp;
2025                 return;
2026         }
2027 
2028         /*
2029                  * Since we try to take pages from the high end of the pool,
2030                  * chances are good that the pages to be put on the list will
2031                  * go at or near the end of the list, so start at the end and
2032          * work backwards.
2033          */
2034         look = (*poolp)->p_prev;
2035         while (mfn < mfn_list[look->p_pagenum]) {
2036                 look = look->p_prev;
2037                 if (look == (*poolp)->p_prev)
2038                         break; /* backed all the way to front of list */
2039         }
2040 
2041         /* insert after look */
2042         pp->p_prev = look;
2043         pp->p_next = look->p_next;
2044         pp->p_next->p_prev = pp;
2045         look->p_next = pp;
2046         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2047                 /*
2048                  * We inserted a new first list element; adjust the
2049                  * pool pointer to the newly inserted element.
2050                  */
2051                 *poolp = pp;
2052         }
2053 }
2054 
2055 /*
2056  * Add a page to the io_pool.  Setting the force flag will force the page
2057  * into the io_pool no matter what.
2058  */
2059 static void
2060 add_page_to_pool(page_t *pp, int force)
2061 {
2062         page_t *highest;
2063         page_t *freep = NULL;
2064 
2065         mutex_enter(&io_pool_lock);
2066         /*
2067          * Always keep the scarce low memory pages
2068          */
2069         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2070                 ++io_pool_cnt;
2071                 page_io_pool_add(&io_pool_16m, pp);
2072                 goto done;
2073         }
2074         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2075                 ++io_pool_cnt;
2076                 page_io_pool_add(&io_pool_4g, pp);
2077         } else {
2078                 highest = io_pool_4g->p_prev;
2079                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2080                         page_io_pool_sub(&io_pool_4g, highest, highest);
2081                         page_io_pool_add(&io_pool_4g, pp);
2082                         freep = highest;
2083                 } else {
2084                         freep = pp;
2085                 }
2086         }
2087 done:
2088         mutex_exit(&io_pool_lock);
2089         if (freep)
2090                 page_free(freep, 1);
2091 }
2092 
2093 
2094 int contig_pfn_cnt;     /* no of pfns in the contig pfn list */
2095 int contig_pfn_max;     /* capacity of the contig pfn list */
2096 int next_alloc_pfn;     /* next position in list to start a contig search */
2097 int contig_pfnlist_updates;     /* pfn list update count */
2098 int contig_pfnlist_builds;      /* how many times have we (re)built list */
2099 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2100 int create_contig_pending;      /* nonzero means taskq creating contig list */
2101 pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2102 
2103 /*
2104  * Comparison function for sorting a list of pfns by their underlying mfns.
2105  */
2106 static int
2107 mfn_compare(const void *pfnp1, const void *pfnp2)
2108 {
2109         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2110         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2111 
2112         if (mfn1 > mfn2)
2113                 return (1);
2114         if (mfn1 < mfn2)
2115                 return (-1);
2116         return (0);
2117 }
2118 
2119 /*
2120  * Compact the contig_pfn_list by tossing all the non-contiguous
2121  * elements from the list.
2122  */
2123 static void
2124 compact_contig_pfn_list(void)
2125 {
2126         pfn_t pfn, lapfn, prev_lapfn;
2127         mfn_t mfn;
2128         int i, newcnt = 0;
2129 
2130         prev_lapfn = 0;
2131         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2132                 pfn = contig_pfn_list[i];
2133                 lapfn = contig_pfn_list[i + 1];
2134                 mfn = mfn_list[pfn];
2135                 /*
2136                  * See if next pfn is for a contig mfn
2137                  */
2138                 if (mfn_list[lapfn] != mfn + 1)
2139                         continue;
2140                 /*
2141                  * pfn and lookahead are both put in list
2142                  * unless pfn is the previous lookahead.
2143                  */
2144                 if (pfn != prev_lapfn)
2145                         contig_pfn_list[newcnt++] = pfn;
2146                 contig_pfn_list[newcnt++] = lapfn;
2147                 prev_lapfn = lapfn;
2148         }
2149         for (i = newcnt; i < contig_pfn_cnt; i++)
2150                 contig_pfn_list[i] = 0;
2151         contig_pfn_cnt = newcnt;
2152 }
2153 
2154 /*ARGSUSED*/
2155 static void
2156 call_create_contiglist(void *arg)
2157 {
2158         (void) create_contig_pfnlist(PG_WAIT);
2159 }
2160 
2161 /*
2162  * Create list of freelist pfns that have underlying
2163  * contiguous mfns.  The list is kept in ascending mfn order.
2164  * Returns 1 if the list was created, 0 otherwise.
2165  */
2166 static int
2167 create_contig_pfnlist(uint_t flags)
2168 {
2169         pfn_t pfn;
2170         page_t *pp;
2171         int ret = 1;
2172 
2173         mutex_enter(&contig_list_lock);
2174         if (contig_pfn_list != NULL)
2175                 goto out;
2176         contig_pfn_max = freemem + (freemem / 10);
2177         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2178             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2179         if (contig_pfn_list == NULL) {
2180                 /*
2181                  * If we could not create the contig list (because we
2182                  * could not sleep for memory), dispatch a taskq job
2183                  * that can sleep to get the memory.
2184                  */
2185                 if (!create_contig_pending) {
2186                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2187                             NULL, TQ_NOSLEEP) != NULL)
2188                                 create_contig_pending = 1;
2189                 }
2190                 contig_pfnlist_buildfailed++;   /* count list build failures */
2191                 ret = 0;
2192                 goto out;
2193         }
2194         create_contig_pending = 0;
2195         ASSERT(contig_pfn_cnt == 0);
2196         for (pfn = 0; pfn < mfn_count; pfn++) {
2197                 pp = page_numtopp_nolock(pfn);
2198                 if (pp == NULL || !PP_ISFREE(pp))
2199                         continue;
2200                 contig_pfn_list[contig_pfn_cnt] = pfn;
2201                 if (++contig_pfn_cnt == contig_pfn_max)
2202                         break;
2203         }
2204         /*
2205          * Sanity check the new list.
2206          */
2207         if (contig_pfn_cnt < 2) { /* no contig pfns */
2208                 contig_pfn_cnt = 0;
2209                 contig_pfnlist_buildfailed++;
2210                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2211                 contig_pfn_list = NULL;
2212                 contig_pfn_max = 0;
2213                 ret = 0;
2214                 goto out;
2215         }
2216         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2217         compact_contig_pfn_list();
2218         /*
2219          * Make sure next search of the newly created contiguous pfn
2220          * list starts at the beginning of the list.
2221          */
2222         next_alloc_pfn = 0;
2223         contig_pfnlist_builds++;        /* count list builds */
2224 out:
2225         mutex_exit(&contig_list_lock);
2226         return (ret);
2227 }
2228 
2229 
2230 /*
2231  * Toss the current contig pfnlist.  Someone is about to do a massive
2232  * update to pfn<->mfn mappings, so we destroy the list and hold the lock
2233  * until they are done with their update.
2234  */
2235 void
2236 clear_and_lock_contig_pfnlist()
2237 {
2238         pfn_t *listp = NULL;
2239         size_t listsize;
2240 
2241         mutex_enter(&contig_list_lock);
2242         if (contig_pfn_list != NULL) {
2243                 listp = contig_pfn_list;
2244                 listsize = contig_pfn_max * sizeof (pfn_t);
2245                 contig_pfn_list = NULL;
2246                 contig_pfn_max = contig_pfn_cnt = 0;
2247         }
2248         if (listp != NULL)
2249                 kmem_free(listp, listsize);
2250 }
2251 
2252 /*
2253  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2254  * it to be re-created.
2255  */
2256 void
2257 unlock_contig_pfnlist()
2258 {
2259         mutex_exit(&contig_list_lock);
2260 }
2261 
2262 /*
2263  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2264  */
2265 void
2266 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2267 {
2268         int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2269         pfn_t probe_pfn;
2270         mfn_t probe_mfn;
2271         int drop_lock = 0;
2272 
2273         if (mutex_owner(&contig_list_lock) != curthread) {
2274                 drop_lock = 1;
2275                 mutex_enter(&contig_list_lock);
2276         }
2277         if (contig_pfn_list == NULL)
2278                 goto done;
2279         contig_pfnlist_updates++;
2280         /*
2281          * Find the pfn in the current list.  Use a binary chop to locate it.
2282          */
2283         probe_hi = contig_pfn_cnt - 1;
2284         probe_lo = 0;
2285         probe_pos = (probe_hi + probe_lo) / 2;
2286         while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2287                 if (probe_pos == probe_lo) { /* pfn not in list */
2288                         probe_pos = -1;
2289                         break;
2290                 }
2291                 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2292                         probe_lo = probe_pos;
2293                 else
2294                         probe_hi = probe_pos;
2295                 probe_pos = (probe_hi + probe_lo) / 2;
2296         }
2297         if (probe_pos >= 0) {
2298                 /*
2299                  * Remove pfn from list and ensure next alloc
2300                  * position stays in bounds.
2301                  */
2302                 if (--contig_pfn_cnt <= next_alloc_pfn)
2303                         next_alloc_pfn = 0;
2304                 if (contig_pfn_cnt < 2) { /* no contig pfns */
2305                         contig_pfn_cnt = 0;
2306                         kmem_free(contig_pfn_list,
2307                             contig_pfn_max * sizeof (pfn_t));
2308                         contig_pfn_list = NULL;
2309                         contig_pfn_max = 0;
2310                         goto done;
2311                 }
2312                 ovbcopy(&contig_pfn_list[probe_pos + 1],
2313                     &contig_pfn_list[probe_pos],
2314                     (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2315         }
2316         if (newmfn == MFN_INVALID)
2317                 goto done;
2318         /*
2319          * Check if new mfn has adjacent mfns in the list
2320          */
2321         probe_hi = contig_pfn_cnt - 1;
2322         probe_lo = 0;
2323         insert_after = -2;
2324         do {
2325                 probe_pos = (probe_hi + probe_lo) / 2;
2326                 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2327                 if (newmfn == probe_mfn + 1)
2328                         insert_after = probe_pos;
2329                 else if (newmfn == probe_mfn - 1)
2330                         insert_after = probe_pos - 1;
2331                 if (probe_pos == probe_lo)
2332                         break;
2333                 if (probe_mfn <= newmfn)
2334                         probe_lo = probe_pos;
2335                 else
2336                         probe_hi = probe_pos;
2337         } while (insert_after == -2);
2338         /*
2339          * If there is space in the list and there are adjacent mfns,
2340          * insert the pfn into its proper place in the list.
2341          */
2342         if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2343                 insert_point = insert_after + 1;
2344                 ovbcopy(&contig_pfn_list[insert_point],
2345                     &contig_pfn_list[insert_point + 1],
2346                     (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2347                 contig_pfn_list[insert_point] = pfn;
2348                 contig_pfn_cnt++;
2349         }
2350 done:
2351         if (drop_lock)
2352                 mutex_exit(&contig_list_lock);
2353 }
2354 
2355 /*
2356  * Called to (re-)populate the io_pool from the free page lists.
2357  */
2358 long
2359 populate_io_pool(void)
2360 {
2361         pfn_t pfn;
2362         mfn_t mfn, max_mfn;
2363         page_t *pp;
2364 
2365         /*
2366          * Figure out the bounds of the pool on first invocation.
2367          * We use a percentage of memory for the io pool size.
2368          * We allow that to shrink, but not below a fixed minimum.
2369          */
2370         if (io_pool_cnt_max == 0) {
2371                 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2372                 io_pool_cnt_lowater = io_pool_cnt_max;
2373                 /*
2374                  * This is the first time in populate_io_pool; grab a va to use
2375                  * when we need to allocate pages.
2376                  */
2377                 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2378         }
2379         /*
2380          * If we are out of pages in the pool, then grow the size of the pool
2381          */
2382         if (io_pool_cnt == 0) {
2383                 /*
2384                  * Grow the max size of the io pool by 5%, but never more than
2385                  * 25% of physical memory.
2386                  */
2387                 if (io_pool_cnt_max < physmem / 4)
2388                         io_pool_cnt_max += io_pool_cnt_max / 20;
2389         }
2390         io_pool_grows++;        /* should be a kstat? */
2391 
2392         /*
2393          * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2394          */
2395         (void) mfn_to_pfn(start_mfn);
2396         max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2397         for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2398                 pfn = mfn_to_pfn(mfn);
2399                 if (pfn & PFN_IS_FOREIGN_MFN)
2400                         continue;
2401                 /*
2402                  * try to allocate it from free pages
2403                  */
2404                 pp = page_numtopp_alloc(pfn);
2405                 if (pp == NULL)
2406                         continue;
2407                 PP_CLRFREE(pp);
2408                 add_page_to_pool(pp, 1);
2409                 if (io_pool_cnt >= io_pool_cnt_max)
2410                         break;
2411         }
2412 
2413         return (io_pool_cnt);
2414 }
2415 
2416 /*
2417  * Destroy a page that was being used for DMA I/O. It may or
2418  * may not actually go back to the io_pool.
2419  */
2420 void
2421 page_destroy_io(page_t *pp)
2422 {
2423         mfn_t mfn = mfn_list[pp->p_pagenum];
2424 
2425         /*
2426          * A reservation was made when the page was alloc'd; release it now.
2427          */
2428         page_unresv(1);
2429         /*
2430          * Unload translations, if any, then hash out the
2431          * page to erase its identity.
2432          */
2433         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2434         page_hashout(pp, NULL);
2435 
2436         /*
2437          * If the page came from the free lists, just put it back to them.
2438          * DomU pages always go on the free lists as well.
2439          */
2440         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2441                 page_free(pp, 1);
2442                 return;
2443         }
2444 
2445         add_page_to_pool(pp, 0);
2446 }
2447 
2448 
2449 long contig_searches;           /* count of times contig pages requested */
2450 long contig_search_restarts;    /* count of contig ranges tried */
2451 long contig_search_failed;      /* count of contig alloc failures */
2452 
2453 /*
2454  * Free partial page list
2455  */
2456 static void
2457 free_partial_list(page_t **pplist)
2458 {
2459         page_t *pp;
2460 
2461         while (*pplist != NULL) {
2462                 pp = *pplist;
2463                 page_io_pool_sub(pplist, pp, pp);
2464                 page_free(pp, 1);
2465         }
2466 }
2467 
2468 /*
2469  * Look through the contiguous pfns that are not part of the io_pool for
2470  * contiguous free pages.  Return a list of the found pages or NULL.
2471  */
2472 page_t *
2473 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2474     pgcnt_t pfnalign)
2475 {
2476         page_t *pp, *plist = NULL;
2477         mfn_t mfn, prev_mfn, start_mfn;
2478         pfn_t pfn;
2479         int pages_needed, pages_requested;
2480         int search_start;
2481 
2482         /*
2483          * create the contig pfn list if not already done
2484          */
2485 retry:
2486         mutex_enter(&contig_list_lock);
2487         if (contig_pfn_list == NULL) {
2488                 mutex_exit(&contig_list_lock);
2489                 if (!create_contig_pfnlist(flags)) {
2490                         return (NULL);
2491                 }
2492                 goto retry;
2493         }
2494         contig_searches++;
2495         /*
2496          * Search contiguous pfn list for physically contiguous pages not in
2497          * the io_pool.  Start the search where the last search left off.
2498          */
2499         pages_requested = pages_needed = npages;
2500         search_start = next_alloc_pfn;
2501         start_mfn = prev_mfn = 0;
2502         while (pages_needed) {
2503                 pfn = contig_pfn_list[next_alloc_pfn];
2504                 mfn = pfn_to_mfn(pfn);
2505                 /*
2506                  * Check if the mfn is the first one or contiguous with the
2507                  * previous one, if the page for the pfn is free, and that
2508                  * the mfn range does not cross a segment boundary.
2509                  */
2510                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2511                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2512                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2513                         PP_CLRFREE(pp);
2514                         page_io_pool_add(&plist, pp);
2515                         pages_needed--;
2516                         if (prev_mfn == 0) {
2517                                 if (pfnalign &&
2518                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2519                                         /*
2520                                          * not properly aligned
2521                                          */
2522                                         contig_search_restarts++;
2523                                         free_partial_list(&plist);
2524                                         pages_needed = pages_requested;
2525                                         start_mfn = prev_mfn = 0;
2526                                         goto skip;
2527                                 }
2528                                 start_mfn = mfn;
2529                         }
2530                         prev_mfn = mfn;
2531                 } else {
2532                         contig_search_restarts++;
2533                         free_partial_list(&plist);
2534                         pages_needed = pages_requested;
2535                         start_mfn = prev_mfn = 0;
2536                 }
2537 skip:
2538                 if (++next_alloc_pfn == contig_pfn_cnt)
2539                         next_alloc_pfn = 0;
2540                 if (next_alloc_pfn == search_start)
2541                         break; /* all pfns searched */
2542         }
2543         mutex_exit(&contig_list_lock);
2544         if (pages_needed) {
2545                 contig_search_failed++;
2546                 /*
2547                  * Failed to find enough contig pages.
2548                  * Free the partial page list.
2549                  */
2550                 free_partial_list(&plist);
2551         }
2552         return (plist);
2553 }
2554 
2555 /*
2556  * Search the reserved io pool pages for a page range with the
2557  * desired characteristics.
2558  */
2559 page_t *
2560 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2561 {
2562         page_t *pp_first, *pp_last;
2563         page_t *pp, **poolp;
2564         pgcnt_t nwanted, pfnalign;
2565         uint64_t pfnseg;
2566         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2567         int align, attempt = 0;
2568 
2569         if (minctg == 1)
2570                 contig = 0;
2571         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2572         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2573         pfnseg = mmu_btop(mattr->dma_attr_seg);
2574         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2575         if (align > MMU_PAGESIZE)
2576                 pfnalign = mmu_btop(align);
2577         else
2578                 pfnalign = 0;
2579 
2580 try_again:
2581         /*
2582          * See if we want pages for a legacy device
2583          */
2584         if (hi_mfn < PFN_16MEG)
2585                 poolp = &io_pool_16m;
2586         else
2587                 poolp = &io_pool_4g;
2588 try_smaller:
2589         /*
2590          * Take pages from I/O pool. We'll use pages from the highest
2591          * MFN range possible.
2592          */
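             /*
              * The pool list is kept in increasing mfn order, so walking
              * p_prev from the head visits the highest mfns first.
              */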
2593         pp_first = pp_last = NULL;
2594         mutex_enter(&io_pool_lock);
2595         nwanted = minctg;
2596         for (pp = *poolp; pp && nwanted > 0; ) {
2597                 pp = pp->p_prev;
2598 
2599                 /*
2600                  * skip pages above allowable range
2601                  */
2602                 mfn = mfn_list[pp->p_pagenum];
2603                 if (hi_mfn < mfn)
2604                         goto skip;
2605 
2606                 /*
2607                  * stop at pages below allowable range
2608                  */
2609                 if (lo_mfn > mfn)
2610                         break;
2611 restart:
2612                 if (pp_last == NULL) {
2613                         /*
2614                          * Check alignment
2615                          */
2616                         tmfn = mfn - (minctg - 1);
2617                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2618                                 goto skip; /* not properly aligned */
2619                         /*
2620                          * Check segment
2621                          */
2622                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2623                                 goto skip; /* crosses seg boundary */
2624                         /*
2625                          * Start building page list
2626                          */
2627                         pp_first = pp_last = pp;
2628                         nwanted--;
2629                 } else {
2630                         /*
2631                          * check physical contiguity if required
2632                          */
2633                         if (contig &&
2634                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2635                                 /*
2636                                  * not a contiguous page, restart list.
2637                                  */
2638                                 pp_last = NULL;
2639                                 nwanted = minctg;
2640                                 goto restart;
2641                         } else { /* add page to list */
2642                                 pp_first = pp;
2643                                 nwanted--;
2644                         }
2645                 }
2646 skip:
2647                 if (pp == *poolp)
2648                         break;
2649         }
2650 
2651         /*
2652          * If we didn't find memory, try the more constrained pool, then
2653          * sweep free pages into the DMA pool and try again.
2654          */
2655         if (nwanted != 0) {
2656                 mutex_exit(&io_pool_lock);
2657                 /*
2658                  * If we were looking in the less constrained pool and
2659                  * didn't find pages, try the more constrained pool.
2660                  */
2661                 if (poolp == &io_pool_4g) {
2662                         poolp = &io_pool_16m;
2663                         goto try_smaller;
2664                 }
2665                 kmem_reap();
2666                 if (++attempt < 4) {
2667                         /*
2668                          * Grab some more io_pool pages
2669                          */
2670                         (void) populate_io_pool();
2671                         goto try_again; /* go around and retry */
2672                 }
2673                 return (NULL);
2674         }
2675         /*
2676          * Found the pages; now snip them from the list.
2677          */
2678         page_io_pool_sub(poolp, pp_first, pp_last);
2679         io_pool_cnt -= minctg;
2680         /*
2681          * reset low water mark
2682          */
2683         if (io_pool_cnt < io_pool_cnt_lowater)
2684                 io_pool_cnt_lowater = io_pool_cnt;
2685         mutex_exit(&io_pool_lock);
2686         return (pp_first);
2687 }
2688 
2689 page_t *
2690 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2691     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2692 {
2693         uint_t kflags;
2694         int order, extra, extpages, i, contig, nbits, extents;
2695         page_t *pp, *expp, *pp_first, **pplist = NULL;
2696         mfn_t *mfnlist = NULL;
2697 
2698         contig = flags & PG_PHYSCONTIG;
2699         if (minctg == 1)
2700                 contig = 0;
2701         flags &= ~PG_PHYSCONTIG;
2702         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2703         /*
2704          * The hypervisor will allocate extents; if we want contig
2705          * pages, the extent must be >= minctg.
2706          */
2707         if (contig) {
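                     /*
                      * Round order up so that the extent (1 << order)
                      * covers at least minctg pages.
                      */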
2708                 order = highbit(minctg) - 1;
2709                 if (minctg & ((1 << order) - 1))
2710                         order++;
2711                 extpages = 1 << order;
2712         } else {
2713                 order = 0;
2714                 extpages = minctg;
2715         }
2716         if (extpages > minctg) {
2717                 extra = extpages - minctg;
2718                 if (!page_resv(extra, kflags))
2719                         return (NULL);
2720         }
2721         pp_first = NULL;
2722         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2723         if (pplist == NULL)
2724                 goto balloon_fail;
2725         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2726         if (mfnlist == NULL)
2727                 goto balloon_fail;
2728         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2729         if (pp == NULL)
2730                 goto balloon_fail;
2731         pp_first = pp;
2732         if (extpages > minctg) {
2733                 /*
2734                  * fill out the rest of the extent pages to swap
2735                  * with the hypervisor
2736                  */
2737                 for (i = 0; i < extra; i++) {
2738                         expp = page_create_va(vp,
2739                             (u_offset_t)(uintptr_t)io_pool_kva,
2740                             PAGESIZE, flags, &kvseg, io_pool_kva);
2741                         if (expp == NULL)
2742                                 goto balloon_fail;
2743                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2744                         page_io_unlock(expp);
2745                         page_hashout(expp, NULL);
2746                         page_io_lock(expp);
2747                         /*
2748                          * add page to end of list
2749                          */
2750                         expp->p_prev = pp_first->p_prev;
2751                         expp->p_next = pp_first;
2752                         expp->p_prev->p_next = expp;
2753                         pp_first->p_prev = expp;
2754                 }
2755 
2756         }
2757         for (i = 0; i < extpages; i++) {
2758                 pplist[i] = pp;
2759                 pp = pp->p_next;
2760         }
2761         nbits = highbit(mattr->dma_attr_addr_hi);
2762         extents = contig ? 1 : minctg;
2763         if (balloon_replace_pages(extents, pplist, nbits, order,
2764             mfnlist) != extents) {
2765                 if (ioalloc_dbg)
2766                         cmn_err(CE_NOTE, "request to hypervisor"
2767                             " for %d pages, maxaddr %" PRIx64 " failed",
2768                             extpages, mattr->dma_attr_addr_hi);
2769                 goto balloon_fail;
2770         }
2771 
2772         kmem_free(pplist, extpages * sizeof (page_t *));
2773         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2774         /*
2775          * Return any excess pages to free list
2776          */
2777         if (extpages > minctg) {
2778                 for (i = 0; i < extra; i++) {
2779                         pp = pp_first->p_prev;
2780                         page_sub(&pp_first, pp);
2781                         page_io_unlock(pp);
2782                         page_unresv(1);
2783                         page_free(pp, 1);
2784                 }
2785         }
2786         return (pp_first);
2787 balloon_fail:
2788         /*
2789          * Return pages to free list and return failure
2790          */
2791         while (pp_first != NULL) {
2792                 pp = pp_first;
2793                 page_sub(&pp_first, pp);
2794                 page_io_unlock(pp);
2795                 if (pp->p_vnode != NULL)
2796                         page_hashout(pp, NULL);
2797                 page_free(pp, 1);
2798         }
2799         if (pplist)
2800                 kmem_free(pplist, extpages * sizeof (page_t *));
2801         if (mfnlist)
2802                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2803         page_unresv(extpages - minctg);
2804         return (NULL);
2805 }
2806 
2807 static void
2808 return_partial_alloc(page_t *plist)
2809 {
2810         page_t *pp;
2811 
2812         while (plist != NULL) {
2813                 pp = plist;
2814                 page_sub(&plist, pp);
2815                 page_io_unlock(pp);
2816                 page_destroy_io(pp);
2817         }
2818 }
2819 
2820 static page_t *
2821 page_get_contigpages(
2822         struct vnode    *vp,
2823         u_offset_t      off,
2824         int             *npagesp,
2825         uint_t          flags,
2826         caddr_t         vaddr,
2827         ddi_dma_attr_t  *mattr)
2828 {
2829         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2830         page_t  *plist; /* list to return */
2831         page_t  *pp, *mcpl;
2832         int     contig, anyaddr, npages, getone = 0;
2833         mfn_t   lo_mfn;
2834         mfn_t   hi_mfn;
2835         pgcnt_t pfnalign = 0;
2836         int     align, sgllen;
2837         uint64_t pfnseg;
2838         pgcnt_t minctg;
2839 
2840         npages = *npagesp;
2841         ASSERT(mattr != NULL);
2842         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2843         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2844         sgllen = mattr->dma_attr_sgllen;
2845         pfnseg = mmu_btop(mattr->dma_attr_seg);
2846         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2847         if (align > MMU_PAGESIZE)
2848                 pfnalign = mmu_btop(align);
2849 
2850         contig = flags & PG_PHYSCONTIG;
2851         if (npages == -1) {
2852                 npages = 1;
2853                 pfnalign = 0;
2854         }
2855         /*
2856          * Clear the contig flag if only one page is needed.
2857          */
2858         if (npages == 1) {
2859                 getone = 1;
2860                 contig = 0;
2861         }
2862 
2863         /*
2864          * Check if any page in the system is fine.
2865          */
2866         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2867         if (!contig && anyaddr && !pfnalign) {
2868                 flags &= ~PG_PHYSCONTIG;
2869                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2870                     flags, &kvseg, vaddr);
2871                 if (plist != NULL) {
2872                         *npagesp = 0;
2873                         return (plist);
2874                 }
2875         }
2876         plist = NULL;
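             /*
              * minctg is the smallest contiguous run that can satisfy the
              * request with at most sgllen scatter/gather entries.
              */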
2877         minctg = howmany(npages, sgllen);
2878         while (npages > sgllen || getone) {
2879                 if (minctg > npages)
2880                         minctg = npages;
2881                 mcpl = NULL;
2882                 /*
2883                  * We could want contig pages with no address range limits.
2884                  */
2885                 if (anyaddr && contig) {
2886                         /*
2887                          * Look for free contig pages to satisfy the request.
2888                          */
2889                         mcpl = find_contig_free(minctg, flags, pfnseg,
2890                             pfnalign);
2891                 }
2892                 /*
2893                  * Try the reserved io pools next
2894                  */
2895                 if (mcpl == NULL)
2896                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2897                 if (mcpl != NULL) {
2898                         pp = mcpl;
2899                         do {
2900                                 if (!page_hashin(pp, vp, off, NULL)) {
2901                                         panic("page_get_contigpages:"
2902                                             " hashin failed"
2903                                             " pp %p, vp %p, off %llx",
2904                                             (void *)pp, (void *)vp, off);
2905                                 }
2906                                 off += MMU_PAGESIZE;
2907                                 PP_CLRFREE(pp);
2908                                 PP_CLRAGED(pp);
2909                                 page_set_props(pp, P_REF);
2910                                 page_io_lock(pp);
2911                                 pp = pp->p_next;
2912                         } while (pp != mcpl);
2913                 } else {
2914                         /*
2915                          * Hypervisor exchange doesn't handle segment or
2916                          * alignment constraints
2917                          */
2918                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
2919                             pfnalign)
2920                                 goto fail;
2921                         /*
2922                          * Try exchanging pages with the hypervisor
2923                          */
2924                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
2925                             flags, minctg);
2926                         if (mcpl == NULL)
2927                                 goto fail;
2928                         off += minctg * MMU_PAGESIZE;
2929                 }
2930                 check_dma(mattr, mcpl, minctg);
2931                 /*
2932                  * Here with a minctg run of contiguous pages, add them to the
2933                  * list we will return for this request.
2934                  */
2935                 page_list_concat(&plist, &mcpl);
2936                 npages -= minctg;
2937                 *npagesp = npages;
2938                 sgllen--;
2939                 if (getone)
2940                         break;
2941         }
2942         return (plist);
2943 fail:
2944         return_partial_alloc(plist);
2945         return (NULL);
2946 }
2947 
2948 /*
2949  * Allocator for domain 0 I/O pages. We match the required
2950  * DMA attributes and contiguity constraints.
2951  */
2952 /*ARGSUSED*/
2953 page_t *
2954 page_create_io(
2955         struct vnode    *vp,
2956         u_offset_t      off,
2957         uint_t          bytes,
2958         uint_t          flags,
2959         struct as       *as,
2960         caddr_t         vaddr,
2961         ddi_dma_attr_t  *mattr)
2962 {
2963         page_t  *plist = NULL, *pp;
2964         int     npages = 0, contig, anyaddr, pages_req;
2965         mfn_t   lo_mfn;
2966         mfn_t   hi_mfn;
2967         pgcnt_t pfnalign = 0;
2968         int     align;
2969         int     is_domu = 0;
2970         int     dummy, bytes_got;
2971         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2972 
2973         ASSERT(mattr != NULL);
2974         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2975         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2976         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2977         if (align > MMU_PAGESIZE)
2978                 pfnalign = mmu_btop(align);
2979 
2980         /*
2981          * Clear the contig flag if only one page is needed or the scatter
2982          * gather list length is >= npages.
2983          */
2984         pages_req = npages = mmu_btopr(bytes);
2985         contig = (flags & PG_PHYSCONTIG);
2986         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
2987         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
2988                 contig = 0;
2989 
2990         /*
2991          * Check if any old page in the system is fine.
2992          * DomU should always go down this path.
2993          */
2994         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
2995         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
2996         if ((!contig && anyaddr) || is_domu) {
2997                 flags &= ~PG_PHYSCONTIG;
2998                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
2999                 if (plist != NULL)
3000                         return (plist);
3001                 else if (is_domu)
3002                         return (NULL); /* no memory available */
3003         }
3004         /*
3005          * DomU should never reach here
3006          */
3007         if (contig) {
3008                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3009                     mattr);
3010                 if (plist == NULL)
3011                         goto fail;
3012                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3013                 vaddr += bytes_got;
3014                 off += bytes_got;
3015                 /*
3016                  * We now have all the contiguous pages we need, but
3017                  * we may still need additional non-contiguous pages.
3018                  */
3019         }
3020         /*
3021          * now loop collecting the requested number of pages, these do
3022          * not have to be contiguous pages but we will use the contig
3023          * page alloc code to get the pages since it will honor any
3024          * other constraints the pages may have.
3025          */
3026         while (npages--) {
3027                 dummy = -1;
3028                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3029                 if (pp == NULL)
3030                         goto fail;
3031                 page_add(&plist, pp);
3032                 vaddr += MMU_PAGESIZE;
3033                 off += MMU_PAGESIZE;
3034         }
3035         return (plist);
3036 fail:
3037         /*
3038          * Failed to get enough pages, return ones we did get
3039          */
3040         return_partial_alloc(plist);
3041         return (NULL);
3042 }
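
/*
 * Illustrative sketch only (not called from anywhere in this file): how a
 * hypothetical dom0 caller might ask page_create_io() above for a single
 * page usable by a device limited to 32-bit DMA addresses.  The vnode,
 * offset and virtual address arguments are placeholders; a real caller
 * supplies its own and is responsible for releasing the page afterwards
 * (not shown here).
 */
static page_t *
example_create_io_page_below_4g(struct vnode *vp, u_offset_t off, caddr_t va)
{
        ddi_dma_attr_t attr;

        bzero(&attr, sizeof (attr));
        attr.dma_attr_version = DMA_ATTR_V0;
        attr.dma_attr_addr_lo = 0;
        attr.dma_attr_addr_hi = 0xffffffffULL;  /* 32-bit limited device */
        attr.dma_attr_seg = 0xffffffffULL;      /* no extra segment limit */
        attr.dma_attr_align = MMU_PAGESIZE;
        attr.dma_attr_minxfer = 1;
        attr.dma_attr_sgllen = 1;

        /* one page, so the contiguity constraint is trivially satisfied */
        return (page_create_io(vp, off, MMU_PAGESIZE, PG_EXCL | PG_WAIT,
            &kas, va, &attr));
}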
3043 
3044 /*
3045  * Lock and return the page with the highest mfn that we can find.  last_mfn
3046  * holds the last one found, so the next search can start from there.  We
3047  * also keep a counter so that we don't loop forever if the machine has no
3048  * free pages.
3049  *
3050  * This is called from the balloon thread to find pages to give away.  new_high
3051  * is used when new mfns have been added to the system; we will reset our
3052  * search if the new mfns are higher than our current search position.
3053  */
3054 page_t *
3055 page_get_high_mfn(mfn_t new_high)
3056 {
3057         static mfn_t last_mfn = 0;
3058         pfn_t pfn;
3059         page_t *pp;
3060         ulong_t loop_count = 0;
3061 
3062         if (new_high > last_mfn)
3063                 last_mfn = new_high;
3064 
3065         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3066                 if (last_mfn == 0) {
3067                         last_mfn = cached_max_mfn;
3068                 }
3069 
3070                 pfn = mfn_to_pfn(last_mfn);
3071                 if (pfn & PFN_IS_FOREIGN_MFN)
3072                         continue;
3073 
3074                 /* See if the page is free.  If so, lock it. */
3075                 pp = page_numtopp_alloc(pfn);
3076                 if (pp == NULL)
3077                         continue;
3078                 PP_CLRFREE(pp);
3079 
3080                 ASSERT(PAGE_EXCL(pp));
3081                 ASSERT(pp->p_vnode == NULL);
3082                 ASSERT(!hat_page_is_mapped(pp));
3083                 last_mfn--;
3084                 return (pp);
3085         }
3086         return (NULL);
3087 }
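
/*
 * Illustrative sketch only: grab one high-mfn page the way the balloon
 * thread would.  Passing 0 never resets the search, so it simply continues
 * downward from wherever the previous call stopped.  What a real caller
 * does with the exclusively locked page afterwards (handing the underlying
 * mfn back to the hypervisor) is omitted here.
 */
static page_t *
example_grab_high_mfn_page(void)
{
        return (page_get_high_mfn(0));
}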
3088 
3089 #else /* !__xpv */
3090 
3091 /*
3092  * Get a page from either the freelist or the cachelist of the given mnode.
3093  */
3094 static page_t *
3095 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3096     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3097 {
3098         kmutex_t                *pcm;
3099         int                     i;
3100         page_t                  *pp;
3101         page_t                  *first_pp;
3102         uint64_t                pgaddr;
3103         ulong_t                 bin;
3104         int                     mtypestart;
3105         int                     plw_initialized;
3106         page_list_walker_t      plw;
3107 
3108         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3109 
3110         ASSERT((flags & PG_MATCH_COLOR) == 0);
3111         ASSERT(szc == 0);
3112         ASSERT(dma_attr != NULL);
3113 
3114         MTYPE_START(mnode, mtype, flags);
3115         if (mtype < 0) {
3116                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3117                 return (NULL);
3118         }
3119 
3120         mtypestart = mtype;
3121 
3122         bin = origbin;
3123 
3124         /*
3125          * Check up to page_colors + 1 bins; origbin may be checked twice
3126          * because of the BIN_STEP skip.
3127          */
3128         do {
3129                 plw_initialized = 0;
3130 
3131                 for (plw.plw_count = 0;
3132                     plw.plw_count < page_colors; plw.plw_count++) {
3133 
3134                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3135                                 goto nextfreebin;
3136 
3137                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3138                         mutex_enter(pcm);
3139                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3140                         first_pp = pp;
3141                         while (pp != NULL) {
3142                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3143                                     SE_EXCL) == 0) {
3144                                         pp = pp->p_next;
3145                                         if (pp == first_pp) {
3146                                                 pp = NULL;
3147                                         }
3148                                         continue;
3149                                 }
3150 
3151                                 ASSERT(PP_ISFREE(pp));
3152                                 ASSERT(PP_ISAGED(pp));
3153                                 ASSERT(pp->p_vnode == NULL);
3154                                 ASSERT(pp->p_hash == NULL);
3155                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3156                                 ASSERT(pp->p_szc == szc);
3157                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3158                                 /* check if page within DMA attributes */
3159                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3160                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3161                                     (pgaddr + MMU_PAGESIZE - 1 <=
3162                                     dma_attr->dma_attr_addr_hi)) {
3163                                         break;
3164                                 }
3165 
3166                                 /* continue looking */
3167                                 page_unlock(pp);
3168                                 pp = pp->p_next;
3169                                 if (pp == first_pp)
3170                                         pp = NULL;
3171 
3172                         }
3173                         if (pp != NULL) {
3174                                 ASSERT(mtype == PP_2_MTYPE(pp));
3175                                 ASSERT(pp->p_szc == 0);
3176 
3177                                 /* found a page with specified DMA attributes */
3178                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3179                                     mtype), pp);
3180                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3181 
3182                                 if ((PP_ISFREE(pp) == 0) ||
3183                                     (PP_ISAGED(pp) == 0)) {
3184                                         cmn_err(CE_PANIC, "page %p is not free",
3185                                             (void *)pp);
3186                                 }
3187 
3188                                 mutex_exit(pcm);
3189                                 check_dma(dma_attr, pp, 1);
3190                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3191                                 return (pp);
3192                         }
3193                         mutex_exit(pcm);
3194 nextfreebin:
3195                         if (plw_initialized == 0) {
3196                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3197                                 ASSERT(plw.plw_ceq_dif == page_colors);
3198                                 plw_initialized = 1;
3199                         }
3200 
3201                         if (plw.plw_do_split) {
3202                                 pp = page_freelist_split(szc, bin, mnode,
3203                                     mtype,
3204                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3205                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3206                                     &plw);
3207                                 if (pp != NULL) {
3208                                         check_dma(dma_attr, pp, 1);
3209                                         return (pp);
3210                                 }
3211                         }
3212 
3213                         bin = page_list_walk_next_bin(szc, bin, &plw);
3214                 }
3215 
3216                 MTYPE_NEXT(mnode, mtype, flags);
3217         } while (mtype >= 0);
3218 
3219         /* failed to find a page in the freelist; try it in the cachelist */
3220 
3221         /* reset mtype start for cachelist search */
3222         mtype = mtypestart;
3223         ASSERT(mtype >= 0);
3224 
3225         /* start with the bin of matching color */
3226         bin = origbin;
3227 
3228         do {
3229                 for (i = 0; i <= page_colors; i++) {
3230                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3231                                 goto nextcachebin;
3232                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3233                         mutex_enter(pcm);
3234                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3235                         first_pp = pp;
3236                         while (pp != NULL) {
3237                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3238                                     SE_EXCL) == 0) {
3239                                         pp = pp->p_next;
3240                                         if (pp == first_pp)
3241                                                 pp = NULL;
3242                                         continue;
3243                                 }
3244                                 ASSERT(pp->p_vnode);
3245                                 ASSERT(PP_ISAGED(pp) == 0);
3246                                 ASSERT(pp->p_szc == 0);
3247                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3248 
3249                                 /* check if page within DMA attributes */
3250 
3251                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3252                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3253                                     (pgaddr + MMU_PAGESIZE - 1 <=
3254                                     dma_attr->dma_attr_addr_hi)) {
3255                                         break;
3256                                 }
3257 
3258                                 /* continue looking */
3259                                 page_unlock(pp);
3260                                 pp = pp->p_next;
3261                                 if (pp == first_pp)
3262                                         pp = NULL;
3263                         }
3264 
3265                         if (pp != NULL) {
3266                                 ASSERT(mtype == PP_2_MTYPE(pp));
3267                                 ASSERT(pp->p_szc == 0);
3268 
3269                                 /* found a page with specified DMA attributes */
3270                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3271                                     mtype), pp);
3272                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3273 
3274                                 mutex_exit(pcm);
3275                                 ASSERT(pp->p_vnode);
3276                                 ASSERT(PP_ISAGED(pp) == 0);
3277                                 check_dma(dma_attr, pp, 1);
3278                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3279                                 return (pp);
3280                         }
3281                         mutex_exit(pcm);
3282 nextcachebin:
3283                         bin += (i == 0) ? BIN_STEP : 1;
3284                         bin &= page_colors_mask;
3285                 }
3286                 MTYPE_NEXT(mnode, mtype, flags);
3287         } while (mtype >= 0);
3288 
3289         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3290         return (NULL);
3291 }
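
/*
 * Illustrative sketch only (a hypothetical helper, not used above): the
 * DMA range test that page_get_mnode_anylist() applies to each candidate
 * page, pulled out as a standalone predicate.
 */
static int
example_page_fits_dma_range(page_t *pp, ddi_dma_attr_t *dma_attr)
{
        uint64_t pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));

        return (pgaddr >= dma_attr->dma_attr_addr_lo &&
            pgaddr + MMU_PAGESIZE - 1 <= dma_attr->dma_attr_addr_hi);
}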
3292 
3293 /*
3294  * This function is similar to page_get_freelist()/page_get_cachelist()
3295  * but it searches both the lists to find a page with the specified
3296  * color (or no color) and DMA attributes. The search is done in the
3297  * freelist first and then in the cache list within the highest memory
3298  * range (based on DMA attributes) before searching in the lower
3299  * memory ranges.
3300  *
3301  * Note: This function is called only by page_create_io().
3302  */
3303 /*ARGSUSED*/
3304 static page_t *
3305 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3306     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3307 {
3308         uint_t          bin;
3309         int             mtype;
3310         page_t          *pp;
3311         int             n;
3312         int             m;
3313         int             szc;
3314         int             fullrange;
3315         int             mnode;
3316         int             local_failed_stat = 0;
3317         lgrp_mnode_cookie_t     lgrp_cookie;
3318 
3319         VM_STAT_ADD(pga_vmstats.pga_alloc);
3320 
3321         /* only base pagesize currently supported */
3322         if (size != MMU_PAGESIZE)
3323                 return (NULL);
3324 
3325         /*
3326          * If we're passed a specific lgroup, we use it.  Otherwise,
3327          * assume first-touch placement is desired.
3328          */
3329         if (!LGRP_EXISTS(lgrp))
3330                 lgrp = lgrp_home_lgrp();
3331 
3332         /* LINTED */
3333         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3334 
3335         /*
3336          * Only hold one freelist or cachelist lock at a time; that way we
3337          * can start anywhere and not have to worry about lock
3338          * ordering.
3339          */
3340         if (dma_attr == NULL) {
3341                 n = mtype16m;
3342                 m = mtypetop;
3343                 fullrange = 1;
3344                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3345         } else {
3346                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3347                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3348 
3349                 /*
3350                  * We can only guarantee alignment to a page boundary.
3351                  */
3352                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3353                         return (NULL);
3354 
3355                 /* Sanity check the dma_attr */
3356                 if (pfnlo > pfnhi)
3357                         return (NULL);
3358 
3359                 n = pfn_2_mtype(pfnlo);
3360                 m = pfn_2_mtype(pfnhi);
3361 
3362                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3363                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3364         }
3365         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3366 
3367         szc = 0;
3368 
3369         /* cycling through mtype is handled by RANGE0 if n == mtype16m */
3370         if (n == mtype16m) {
3371                 flags |= PGI_MT_RANGE0;
3372                 n = m;
3373         }
3374 
3375         /*
3376          * Try local memory node first, but try remote if we can't
3377          * get a page of the right color.
3378          */
3379         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3380         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3381                 /*
3382                  * allocate pages from high pfn to low.
3383                  */
3384                 mtype = m;
3385                 do {
3386                         if (fullrange != 0) {
3387                                 pp = page_get_mnode_freelist(mnode,
3388                                     bin, mtype, szc, flags);
3389                                 if (pp == NULL) {
3390                                         pp = page_get_mnode_cachelist(
3391                                             bin, flags, mnode, mtype);
3392                                 }
3393                         } else {
3394                                 pp = page_get_mnode_anylist(bin, szc,
3395                                     flags, mnode, mtype, dma_attr);
3396                         }
3397                         if (pp != NULL) {
3398                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3399                                 check_dma(dma_attr, pp, 1);
3400                                 return (pp);
3401                         }
3402                 } while (mtype != n &&
3403                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3404                 if (!local_failed_stat) {
3405                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3406                         local_failed_stat = 1;
3407                 }
3408         }
3409         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3410 
3411         return (NULL);
3412 }
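
/*
 * Illustrative sketch only: with a NULL dma_attr, page_get_anylist() above
 * degenerates into an unconstrained top-down search over all memory ranges,
 * so a call like this behaves much like asking the freelist/cachelist for a
 * single base page.  The vnode/offset/vaddr arguments are placeholders.
 */
static page_t *
example_get_any_base_page(struct vnode *vp, u_offset_t off, caddr_t vaddr)
{
        return (page_get_anylist(vp, off, &kas, vaddr, MMU_PAGESIZE,
            PG_EXCL, NULL, NULL));
}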
3413 
3414 /*
3415  * page_create_io()
3416  *
3417  * This function is a copy of page_create_va() with an additional
3418  * argument 'mattr' that specifies DMA memory requirements to
3419  * the page list functions. This function is used by the segkmem
3420  * allocator, so it is only used to create new pages (i.e., PG_EXCL
3421  * is set).
3422  *
3423  * Note: This interface is currently used only by the x86 PSM and is
3424  *       not fully specified, so the commitment level is only that of
3425  *       a private interface specific to x86. This interface uses the
3426  *       PSM-specific page_get_anylist() interface.
3427  */
3428 
3429 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3430         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3431                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3432                         break; \
3433         } \
3434 }
3435 
3436 
3437 page_t *
3438 page_create_io(
3439         struct vnode    *vp,
3440         u_offset_t      off,
3441         uint_t          bytes,
3442         uint_t          flags,
3443         struct as       *as,
3444         caddr_t         vaddr,
3445         ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3446 {
3447         page_t          *plist = NULL;
3448         uint_t          plist_len = 0;
3449         pgcnt_t         npages;
3450         page_t          *npp = NULL;
3451         uint_t          pages_req;
3452         page_t          *pp;
3453         kmutex_t        *phm = NULL;
3454         uint_t          index;
3455 
3456         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3457             "page_create_start:vp %p off %llx bytes %u flags %x",
3458             vp, off, bytes, flags);
3459 
3460         ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3461 
3462         pages_req = npages = mmu_btopr(bytes);
3463 
3464         /*
3465          * Do the freemem and pcf accounting.
3466          */
3467         if (!page_create_wait(npages, flags)) {
3468                 return (NULL);
3469         }
3470 
3471         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3472             "page_create_success:vp %p off %llx", vp, off);
3473 
3474         /*
3475          * If satisfying this request has left us with too little
3476          * memory, start the wheels turning to get some back.  The
3477          * first clause of the test prevents waking up the pageout
3478          * daemon in situations where it would decide that there's
3479          * nothing to do.
3480          */
3481         if (nscan < desscan && freemem < minfree) {
3482                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3483                     "pageout_cv_signal:freemem %ld", freemem);
3484                 cv_signal(&proc_pageout->p_cv);
3485         }
3486 
3487         if (flags & PG_PHYSCONTIG) {
3488 
3489                 plist = page_get_contigpage(&npages, mattr, 1);
3490                 if (plist == NULL) {
3491                         page_create_putback(npages);
3492                         return (NULL);
3493                 }
3494 
3495                 pp = plist;
3496 
3497                 do {
3498                         if (!page_hashin(pp, vp, off, NULL)) {
3499                                 panic("page_create_io: hashin failed %p %p %llx",
3500                                     (void *)pp, (void *)vp, off);
3501                         }
3502                         VM_STAT_ADD(page_create_new);
3503                         off += MMU_PAGESIZE;
3504                         PP_CLRFREE(pp);
3505                         PP_CLRAGED(pp);
3506                         page_set_props(pp, P_REF);
3507                         pp = pp->p_next;
3508                 } while (pp != plist);
3509 
3510                 if (!npages) {
3511                         check_dma(mattr, plist, pages_req);
3512                         return (plist);
3513                 } else {
3514                         vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3515                 }
3516 
3517                 /*
3518                  * fall-thru:
3519                  *
3520                  * page_get_contigpage returns when npages <= sgllen.
3521                  * Grab the rest of the non-contig pages below from anylist.
3522                  */
3523         }
3524 
3525         /*
3526          * Loop around collecting the requested number of pages.
3527          * Most of the time, we have to `create' a new page. With
3528          * this in mind, pull the page off the free list before
3529          * getting the hash lock.  This will minimize the hash
3530          * lock hold time, nesting, and the like.  If it turns
3531          * out we don't need the page, we put it back at the end.
3532          */
3533         while (npages--) {
3534                 phm = NULL;
3535 
3536                 index = PAGE_HASH_FUNC(vp, off);
3537 top:
3538                 ASSERT(phm == NULL);
3539                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3540                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3541 
3542                 if (npp == NULL) {
3543                         /*
3544                          * Try to get a page of any color, either from
3545                          * the freelist or from the cachelist.
3546                          */
3547                         npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3548                             flags & ~PG_MATCH_COLOR, mattr, NULL);
3549                         if (npp == NULL) {
3550                                 if (mattr == NULL) {
3551                                         /*
3552                                          * Not looking for a special page;
3553                                          * panic!
3554                                          */
3555                                         panic("no page found %d", (int)npages);
3556                                 }
3557                                 /*
3558                                  * No page found! This can happen
3559                                  * if we are looking for a page
3560                                  * within a specific memory range
3561                                  * for DMA purposes. If PG_WAIT is
3562                                  * specified then we wait for a
3563                                  * while and then try again. The
3564                                  * wait could be forever if we
3565                                  * don't get the page(s) we need.
3566                                  *
3567                                  * Note: XXX We really need a mechanism
3568                                  * to wait for pages in the desired
3569                                  * range. For now, we wait for any
3570                                  * page and see if we can use it.
3571                                  */
3572 
3573                                 if ((mattr != NULL) && (flags & PG_WAIT)) {
3574                                         delay(10);
3575                                         goto top;
3576                                 }
3577                                 goto fail; /* undo accounting stuff */
3578                         }
3579 
3580                         if (PP_ISAGED(npp) == 0) {
3581                                 /*
3582                                  * Since this page came from the
3583                                  * cachelist, we must destroy the
3584                                  * old vnode association.
3585                                  */
3586                                 page_hashout(npp, (kmutex_t *)NULL);
3587                         }
3588                 }
3589 
3590                 /*
3591                  * We own this page!
3592                  */
3593                 ASSERT(PAGE_EXCL(npp));
3594                 ASSERT(npp->p_vnode == NULL);
3595                 ASSERT(!hat_page_is_mapped(npp));
3596                 PP_CLRFREE(npp);
3597                 PP_CLRAGED(npp);
3598 
3599                 /*
3600                  * Here we have a page in our hot little mitts and are
3601                  * just waiting to stuff it on the appropriate lists.
3602                  * Get the mutex and check to see if it really does
3603                  * not exist.
3604                  */
3605                 phm = PAGE_HASH_MUTEX(index);
3606                 mutex_enter(phm);
3607                 PAGE_HASH_SEARCH(index, pp, vp, off);
3608                 if (pp == NULL) {
3609                         VM_STAT_ADD(page_create_new);
3610                         pp = npp;
3611                         npp = NULL;
3612                         if (!page_hashin(pp, vp, off, phm)) {
3613                                 /*
3614                                  * Since we hold the page hash mutex and
3615                                  * just searched for this page, page_hashin
3616                                  * had better not fail.  If it does, that
3617                                  * means some thread did not follow the
3618                                  * page hash mutex rules.  Panic now and
3619                                  * get it over with.  As usual, go down
3620                                  * holding all the locks.
3621                                  */
3622                                 ASSERT(MUTEX_HELD(phm));
3623                                 panic("page_create: hashin fail %p %p %llx %p",
3624                                     (void *)pp, (void *)vp, off, (void *)phm);
3625 
3626                         }
3627                         ASSERT(MUTEX_HELD(phm));
3628                         mutex_exit(phm);
3629                         phm = NULL;
3630 
3631                         /*
3632                          * Hat layer locking need not be done to set
3633                          * the following bits since the page is not hashed
3634                          * and was on the free list (i.e., had no mappings).
3635                          *
3636                          * Set the reference bit to protect
3637                          * against immediate pageout
3638                          *
3639                          * XXXmh modify freelist code to set reference
3640                          * bit so we don't have to do it here.
3641                          */
3642                         page_set_props(pp, P_REF);
3643                 } else {
3644                         ASSERT(MUTEX_HELD(phm));
3645                         mutex_exit(phm);
3646                         phm = NULL;
3647                         /*
3648                          * NOTE: This should not happen for pages associated
3649                          *       with kernel vnode 'kvp'.
3650                          */
3651                         /* XX64 - to debug why this happens! */
3652                         ASSERT(!VN_ISKAS(vp));
3653                         if (VN_ISKAS(vp))
3654                                 cmn_err(CE_NOTE,
3655                                     "page_create: page not expected "
3656                                     "in hash list for kernel vnode - pp 0x%p",
3657                                     (void *)pp);
3658                         VM_STAT_ADD(page_create_exists);
3659                         goto fail;
3660                 }
3661 
3662                 /*
3663                  * Got a page!  It is locked.  Acquire the i/o
3664                  * lock since we are going to use the p_next and
3665                  * p_prev fields to link the requested pages together.
3666                  */
3667                 page_io_lock(pp);
3668                 page_add(&plist, pp);
3669                 plist = plist->p_next;
3670                 off += MMU_PAGESIZE;
3671                 vaddr += MMU_PAGESIZE;
3672         }
3673 
3674         check_dma(mattr, plist, pages_req);
3675         return (plist);
3676 
3677 fail:
3678         if (npp != NULL) {
3679                 /*
3680                  * Did not need this page after all.
3681                  * Put it back on the free list.
3682                  */
3683                 VM_STAT_ADD(page_create_putbacks);
3684                 PP_SETFREE(npp);
3685                 PP_SETAGED(npp);
3686                 npp->p_offset = (u_offset_t)-1;
3687                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3688                 page_unlock(npp);
3689         }
3690 
3691         /*
3692          * Give up the pages we already got.
3693          */
3694         while (plist != NULL) {
3695                 pp = plist;
3696                 page_sub(&plist, pp);
3697                 page_io_unlock(pp);
3698                 plist_len++;
3699                 /*LINTED: constant in conditional ctx*/
3700                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3701         }
3702 
3703         /*
3704          * VN_DISPOSE does freemem accounting for the pages in plist
3705          * by calling page_free. So, we need to undo the pcf accounting
3706          * for only the remaining pages.
3707          */
3708         VM_STAT_ADD(page_create_putbacks);
3709         page_create_putback(pages_req - plist_len);
3710 
3711         return (NULL);
3712 }
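
/*
 * Illustrative sketch only: how a hypothetical caller might use
 * page_create_io() above to obtain four physically contiguous pages below
 * 16 MB, e.g. for a legacy 24-bit DMA engine.  The vnode/offset/vaddr
 * values are placeholders, and the caller is responsible for the usual
 * page_io_unlock()/teardown of the returned list (not shown).
 */
static page_t *
example_create_contig_pages_below_16m(struct vnode *vp, u_offset_t off,
    caddr_t va)
{
        ddi_dma_attr_t attr;

        bzero(&attr, sizeof (attr));
        attr.dma_attr_version = DMA_ATTR_V0;
        attr.dma_attr_addr_lo = 0;
        attr.dma_attr_addr_hi = 0xffffffULL;    /* below 16 MB */
        attr.dma_attr_seg = 0xffffffULL;
        attr.dma_attr_align = MMU_PAGESIZE;
        attr.dma_attr_minxfer = 1;
        attr.dma_attr_sgllen = 1;               /* force physical contiguity */

        return (page_create_io(vp, off, 4 * MMU_PAGESIZE,
            PG_EXCL | PG_PHYSCONTIG, &kas, va, &attr));
}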
3713 #endif /* !__xpv */
3714 
3715 
3716 /*
3717  * Copy the data from the physical page represented by "frompp" to
3718  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3719  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3720  * level and no one sleeps with an active mapping there.
3721  *
3722  * Note that the ref/mod bits in the page_t's are not affected by
3723  * this operation, hence it is up to the caller to update them appropriately.
3724  */
3725 int
3726 ppcopy(page_t *frompp, page_t *topp)
3727 {
3728         caddr_t         pp_addr1;
3729         caddr_t         pp_addr2;
3730         hat_mempte_t    pte1;
3731         hat_mempte_t    pte2;
3732         kmutex_t        *ppaddr_mutex;
3733         label_t         ljb;
3734         int             ret = 1;
3735 
3736         ASSERT_STACK_ALIGNED();
3737         ASSERT(PAGE_LOCKED(frompp));
3738         ASSERT(PAGE_LOCKED(topp));
3739 
3740         if (kpm_enable) {
3741                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3742                 pp_addr2 = hat_kpm_page2va(topp, 0);
3743                 kpreempt_disable();
3744         } else {
3745                 /*
3746                  * disable preemption so that the CPU can't change underneath us
3747                  */
3748                 kpreempt_disable();
3749 
3750                 pp_addr1 = CPU->cpu_caddr1;
3751                 pp_addr2 = CPU->cpu_caddr2;
3752                 pte1 = CPU->cpu_caddr1pte;
3753                 pte2 = CPU->cpu_caddr2pte;
3754 
3755                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3756                 mutex_enter(ppaddr_mutex);
3757 
3758                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3759                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3760                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3761                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3762                     HAT_LOAD_NOCONSIST);
3763         }
3764 
3765         if (on_fault(&ljb)) {
3766                 ret = 0;
3767                 goto faulted;
3768         }
3769         if (use_sse_pagecopy)
3770 #ifdef __xpv
3771                 page_copy_no_xmm(pp_addr2, pp_addr1);
3772 #else
3773                 hwblkpagecopy(pp_addr1, pp_addr2);
3774 #endif
3775         else
3776                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3777 
3778         no_fault();
3779 faulted:
3780         if (!kpm_enable) {
3781 #ifdef __xpv
3782                 /*
3783                  * We can't leave unused mappings lying about under the
3784                  * hypervisor, so blow them away.
3785                  */
3786                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3787                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3788                         panic("HYPERVISOR_update_va_mapping() failed");
3789                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3790                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3791                         panic("HYPERVISOR_update_va_mapping() failed");
3792 #endif
3793                 mutex_exit(ppaddr_mutex);
3794         }
3795         kpreempt_enable();
3796         return (ret);
3797 }
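
/*
 * Illustrative sketch only: copy the contents of one locked page into
 * another, e.g. as part of relocating a page.  Both pages must already be
 * locked by the caller; ref/mod bits are deliberately left alone, as noted
 * in the comment above ppcopy().
 */
static int
example_copy_page_contents(page_t *src, page_t *dst)
{
        ASSERT(PAGE_LOCKED(src));
        ASSERT(PAGE_LOCKED(dst));

        return (ppcopy(src, dst));      /* returns 0 if a fault was taken */
}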
3798 
3799 void
3800 pagezero(page_t *pp, uint_t off, uint_t len)
3801 {
3802         ASSERT(PAGE_LOCKED(pp));
3803         pfnzero(page_pptonum(pp), off, len);
3804 }
3805 
3806 /*
3807  * Zero the physical page from off to off + len given by pfn
3808  * without changing the reference and modified bits of page.
3809  *
3810  * We do this using CPU private page address #2; see ppcopy() for more info.
3811  * pfnzero() must not be called at interrupt level.
3812  */
3813 void
3814 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3815 {
3816         caddr_t         pp_addr2;
3817         hat_mempte_t    pte2;
3818         kmutex_t        *ppaddr_mutex = NULL;
3819 
3820         ASSERT_STACK_ALIGNED();
3821         ASSERT(len <= MMU_PAGESIZE);
3822         ASSERT(off <= MMU_PAGESIZE);
3823         ASSERT(off + len <= MMU_PAGESIZE);
3824 
3825         if (kpm_enable && !pfn_is_foreign(pfn)) {
3826                 pp_addr2 = hat_kpm_pfn2va(pfn);
3827                 kpreempt_disable();
3828         } else {
3829                 kpreempt_disable();
3830 
3831                 pp_addr2 = CPU->cpu_caddr2;
3832                 pte2 = CPU->cpu_caddr2pte;
3833 
3834                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3835                 mutex_enter(ppaddr_mutex);
3836 
3837                 hat_mempte_remap(pfn, pp_addr2, pte2,
3838                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3839                     HAT_LOAD_NOCONSIST);
3840         }
3841 
3842         if (use_sse_pagezero) {
3843 #ifdef __xpv
3844                 uint_t rem;
3845 
3846                 /*
3847                  * zero a byte at a time until properly aligned for
3848                  * block_zero_no_xmm().
3849                  */
3850                 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3851                         pp_addr2[off++] = 0;
3852 
3853                 /*
3854                  * Now use faster block_zero_no_xmm() for any range
3855                  * that is properly aligned and sized.
3856                  */
3857                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3858                 len -= rem;
3859                 if (len != 0) {
3860                         block_zero_no_xmm(pp_addr2 + off, len);
3861                         off += len;
3862                 }
3863 
3864                 /*
3865                  * zero remainder with byte stores.
3866                  */
3867                 while (rem-- > 0)
3868                         pp_addr2[off++] = 0;
3869 #else
3870                 hwblkclr(pp_addr2 + off, len);
3871 #endif
3872         } else {
3873                 bzero(pp_addr2 + off, len);
3874         }
3875 
3876         if (!kpm_enable || pfn_is_foreign(pfn)) {
3877 #ifdef __xpv
3878                 /*
3879                  * On the hypervisor this page might get used for a page
3880                  * table before any intervening change to this mapping,
3881                  * so blow it away.
3882                  */
3883                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3884                     UVMF_INVLPG) < 0)
3885                         panic("HYPERVISOR_update_va_mapping() failed");
3886 #endif
3887                 mutex_exit(ppaddr_mutex);
3888         }
3889 
3890         kpreempt_enable();
3891 }
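
/*
 * Illustrative sketch only: zero just the second half of a page given only
 * its page_t.  pagezero() above covers the common case; pfnzero() is the
 * entry point to use when only the pfn is known.
 */
static void
example_zero_page_tail(page_t *pp)
{
        ASSERT(PAGE_LOCKED(pp));
        pfnzero(page_pptonum(pp), MMU_PAGESIZE / 2, MMU_PAGESIZE / 2);
}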
3892 
3893 /*
3894  * Platform-dependent page scrub call.
3895  */
3896 void
3897 pagescrub(page_t *pp, uint_t off, uint_t len)
3898 {
3899         /*
3900          * For now, we rely on the fact that pagezero() will
3901          * always clear UEs.
3902          */
3903         pagezero(pp, off, len);
3904 }
3905 
3906 /*
3907  * Set up two private virtual addresses on a given CPU for use by ppcopy().
3908  */
3909 void
3910 setup_vaddr_for_ppcopy(struct cpu *cpup)
3911 {
3912         void *addr;
3913         hat_mempte_t pte_pa;
3914 
3915         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3916         pte_pa = hat_mempte_setup(addr);
3917         cpup->cpu_caddr1 = addr;
3918         cpup->cpu_caddr1pte = pte_pa;
3919 
3920         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3921         pte_pa = hat_mempte_setup(addr);
3922         cpup->cpu_caddr2 = addr;
3923         cpup->cpu_caddr2pte = pte_pa;
3924 
3925         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
3926 }
3927 
3928 /*
3929  * Undo setup_vaddr_for_ppcopy
3930  */
3931 void
3932 teardown_vaddr_for_ppcopy(struct cpu *cpup)
3933 {
3934         mutex_destroy(&cpup->cpu_ppaddr_mutex);
3935 
3936         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3937         cpup->cpu_caddr2pte = 0;
3938         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3939         cpup->cpu_caddr2 = 0;
3940 
3941         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3942         cpup->cpu_caddr1pte = 0;
3943         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3944         cpup->cpu_caddr1 = 0;
3945 }
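
/*
 * Illustrative sketch only: the two routines above are meant to be paired
 * around a CPU's lifetime; the private ppcopy()/pfnzero() mappings are
 * created while the CPU is being brought up and released when it goes away.
 * The real call sites are assumed to be in the CPU bringup/teardown code,
 * not in a helper like this.
 */
static void
example_cpu_ppcopy_lifecycle(struct cpu *cp)
{
        setup_vaddr_for_ppcopy(cp);
        /* ... the CPU runs; ppcopy()/pfnzero() may use cpu_caddr1/2 ... */
        teardown_vaddr_for_ppcopy(cp);
}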
3946 
3947 /*
3948  * Function for flushing D-cache when performing module relocations
3949  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
3950  */
3951 void
3952 dcache_flushall()
3953 {}
3954 
3955 /*
3956  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
3957  * number to vary where the pages come from.  This is quite a hacked up
3958  * method -- it works for now, but really needs to be fixed up a bit.
3959  *
3960  * We currently use page_create_va() on the kvp with fake offsets,
3961  * segments and virt address.  This is pretty bogus, but was copied from the
3962  * old hat_i86.c code.  A better approach would be to specify either mnode
3963  * random or mnode local and take a page from whatever color has the most
3964  * pages available; this would have a minimal impact on page coloring.
3965  */
3966 page_t *
3967 page_get_physical(uintptr_t seed)
3968 {
3969         page_t *pp;
3970         u_offset_t offset;
3971         static struct seg tmpseg;
3972         static uintptr_t ctr = 0;
3973 
3974         /*
3975          * This code is gross; we really need a simpler page allocator.
3976          *
3977          * We need to assign an offset for the page in order to call
3978          * page_create_va().  To avoid conflicts with other pages, we get
3979          * creative with the offset.  For 32 bits, we need an offset > 4 Gig.
3980          * For 64 bits, we need an offset somewhere in the VA hole.
3981          */
3982         offset = seed;
3983         if (offset > kernelbase)
3984                 offset -= kernelbase;
3985         offset <<= MMU_PAGESHIFT;
3986 #if defined(__amd64)
3987         offset += mmu.hole_start;       /* something in VA hole */
3988 #else
3989         offset += 1ULL << 40;     /* something > 4 Gig */
3990 #endif
3991 
3992         if (page_resv(1, KM_NOSLEEP) == 0)
3993                 return (NULL);
3994 
3995 #ifdef  DEBUG
3996         pp = page_exists(&kvp, offset);
3997         if (pp != NULL)
3998                 panic("page already exists %p", (void *)pp);
3999 #endif
4000 
4001         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4002             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));       /* changing VA usage */
4003         if (pp != NULL) {
4004                 page_io_unlock(pp);
4005                 page_downgrade(pp);
4006         }
4007         return (pp);
4008 }
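
/*
 * Illustrative sketch only: the 'seed' argument merely perturbs the fake
 * kvp offset fabricated above, so any cheap varying value will do.  A
 * static counter is used here purely as an example; releasing the page
 * again is the caller's problem and is not shown.
 */
static page_t *
example_get_scratch_page(void)
{
        static uintptr_t example_seed = 0;

        return (page_get_physical(example_seed++));
}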