1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2018 Joyent, Inc.
  28  */
  29 
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32 
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37 
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>
  48 #include <sys/vmem.h>
  49 #include <sys/buf.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/lgrp.h>
  52 #include <sys/disp.h>
  53 #include <sys/vm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vnode.h>
  56 #include <sys/cred.h>
  57 #include <sys/exec.h>
  58 #include <sys/exechdr.h>
  59 #include <sys/debug.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/swap.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/random.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_kp.h>
  69 #include <vm/seg_vn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_kmem.h>
  72 #include <vm/seg_kpm.h>
  73 #include <vm/vm_dep.h>
  74 
  75 #include <sys/cpu.h>
  76 #include <sys/vm_machparam.h>
  77 #include <sys/memlist.h>
  78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  79 #include <vm/hat_i86.h>
  80 #include <sys/x86_archext.h>
  81 #include <sys/elf_386.h>
  82 #include <sys/cmn_err.h>
  83 #include <sys/archsystm.h>
  84 #include <sys/machsystm.h>
  85 #include <sys/secflags.h>
  86 
  87 #include <sys/vtrace.h>
  88 #include <sys/ddidmareq.h>
  89 #include <sys/promif.h>
  90 #include <sys/memnode.h>
  91 #include <sys/stack.h>
  92 #include <util/qsort.h>
  93 #include <sys/taskq.h>
  94 
  95 #ifdef __xpv
  96 
  97 #include <sys/hypervisor.h>
  98 #include <sys/xen_mmu.h>
  99 #include <sys/balloon_impl.h>
 100 
/*
 * Domain 0 pages usable for DMA are pre-allocated and kept in distinct
 * lists, ordered by increasing mfn.
 */
 105 static kmutex_t io_pool_lock;
 106 static kmutex_t contig_list_lock;
 107 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 108 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 109 static long io_pool_cnt;
 110 static long io_pool_cnt_max = 0;
 111 #define DEFAULT_IO_POOL_MIN     128
 112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 113 static long io_pool_cnt_lowater = 0;
 114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 115 static long io_pool_shrinks;    /* how many times did we really shrink */
 116 static long io_pool_grows;      /* how many times did we grow */
 117 static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* used to alloc pages when needed */
 119 
 120 static int create_contig_pfnlist(uint_t);
 121 
 122 /*
 123  * percentage of phys mem to hold in the i/o pool
 124  */
 125 #define DEFAULT_IO_POOL_PCT     2
 126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
 128 int ioalloc_dbg = 0;
 129 
 130 #endif /* __xpv */
 131 
 132 uint_t vac_colors = 1;
 133 
 134 int largepagesupport = 0;
 135 extern uint_t page_create_new;
 136 extern uint_t page_create_exists;
 137 extern uint_t page_create_putbacks;
 138 /*
 139  * Allow users to disable the kernel's use of SSE.
 140  */
 141 extern int use_sse_pagecopy, use_sse_pagezero;
 142 
/*
 * Combined memory ranges from mnode and memranges[] used to manage a single
 * mnode/mtype dimension in the page lists.
 */
 147 typedef struct {
 148         pfn_t   mnr_pfnlo;
 149         pfn_t   mnr_pfnhi;
 150         int     mnr_mnode;
 151         int     mnr_memrange;           /* index into memranges[] */
 152         int     mnr_next;               /* next lower PA mnoderange */
 153         int     mnr_exists;
 154         /* maintain page list stats */
 155         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 156         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 157         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 158 #ifdef DEBUG
 159         struct mnr_mts {                /* mnode/mtype szc stats */
 160                 pgcnt_t mnr_mts_pgcnt;
 161                 int     mnr_mts_colors;
 162                 pgcnt_t *mnr_mtsc_pgcnt;
 163         }       *mnr_mts;
 164 #endif
 165 } mnoderange_t;
 166 
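/*
 * Convert a memory range index into the inclusive pfn bounds of that range.
 * memranges[] is ordered from the highest physical range (index 0) down to
 * the lowest, so a range's high bound is one pfn below the low bound of the
 * next higher range, or physmax for the topmost range.
 */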
 167 #define MEMRANGEHI(mtype)                                               \
 168         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169 #define MEMRANGELO(mtype)       (memranges[mtype])
 170 
 171 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172 
/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
 185 #define PFN_4GIG        0x100000
 186 #define PFN_16MEG       0x1000
 187 /* Indices into the memory range (arch_memranges) array. */
 188 #define MRI_4G          0
 189 #define MRI_2G          1
 190 #define MRI_16M         2
 191 #define MRI_0           3
 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193     PFN_4GIG,   /* pfn range for 4G and above */
 194     0x80000,    /* pfn range for 2G-4G */
 195     PFN_16MEG,  /* pfn range for 16M-2G */
 196     0x00000,    /* pfn range for 0-16M */
 197 };
 198 pfn_t *memranges = &arch_memranges[0];
 199 int nranges = NUM_MEM_RANGES;
 200 
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
 205 mnoderange_t    *mnoderanges;
 206 int             mnoderangecnt;
 207 int             mtype4g;
 208 int             mtype16m;
 209 int             mtypetop;       /* index of highest pfn'ed mnoderange */
 210 
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 1/16 (desfree4gshift = 4), about 6%, of maxmem4g.
 232  *
 233  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 234  * and the amount of physical memory above 4g is greater than freemem4g.
 235  * In this case, page_get_* routines will restrict below 4g allocations
 236  * for requests that don't specifically require it.
 237  */
 238 
 239 #define DESFREE4G       (maxmem4g >> desfree4gshift)
 240 
 241 #define RESTRICT4G_ALLOC                                        \
 242         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 243 
 244 static pgcnt_t  maxmem4g;
 245 static pgcnt_t  freemem4g;
 246 static int      physmax4g;
 247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 248 
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262 
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 266         ((freemem != 0) && ((flags & PG_PANIC) == 0) &&             \
 267             ((freemem >= (FREEMEM16M)) ||                    \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269 
 270 static pgcnt_t  desfree16m = 0x380;
 271 
 272 /*
 273  * This can be patched via /etc/system to allow old non-PAE aware device
 274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  */
 276 int restricted_kmemalloc = 0;
 277 
 278 #ifdef VM_STATS
 279 struct {
 280         ulong_t pga_alloc;
 281         ulong_t pga_notfullrange;
 282         ulong_t pga_nulldmaattr;
 283         ulong_t pga_allocok;
 284         ulong_t pga_allocfailed;
 285         ulong_t pgma_alloc;
 286         ulong_t pgma_allocok;
 287         ulong_t pgma_allocfailed;
 288         ulong_t pgma_allocempty;
 289 } pga_vmstats;
 290 #endif
 291 
 292 uint_t mmu_page_sizes;
 293 
 294 /* How many page sizes the users can see */
 295 uint_t mmu_exported_page_sizes;
 296 
 297 /* page sizes that legacy applications can see */
 298 uint_t mmu_legacy_page_sizes;
 299 
 300 /*
 301  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 302  * fewer than this many pages.
 303  */
 304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 306 
 307 /*
 308  * Maximum and default segment size tunables for user private
 309  * and shared anon memory, and user text and initialized data.
 310  * These can be patched via /etc/system to allow large pages
 311  * to be used for mapping application private and shared anon memory.
 312  */
 313 size_t mcntl0_lpsize = MMU_PAGESIZE;
 314 size_t max_uheap_lpsize = MMU_PAGESIZE;
 315 size_t default_uheap_lpsize = MMU_PAGESIZE;
 316 size_t max_ustack_lpsize = MMU_PAGESIZE;
 317 size_t default_ustack_lpsize = MMU_PAGESIZE;
 318 size_t max_privmap_lpsize = MMU_PAGESIZE;
 319 size_t max_uidata_lpsize = MMU_PAGESIZE;
 320 size_t max_utext_lpsize = MMU_PAGESIZE;
 321 size_t max_shm_lpsize = MMU_PAGESIZE;
 322 
 323 
 324 /*
 325  * initialized by page_coloring_init().
 326  */
 327 uint_t  page_colors;
 328 uint_t  page_colors_mask;
 329 uint_t  page_coloring_shift;
 330 int     cpu_page_colors;
 331 static uint_t   l2_colors;
 332 
 333 /*
 334  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 335  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 336  * mnode range, the page freelist and cachelist are hashed into bins based on
 337  * color. This makes it easier to search for a page within a specific memory
 338  * range.
 339  */
 340 #define PAGE_COLORS_MIN 16
 341 
 342 page_t ****page_freelists;
 343 page_t ***page_cachelists;
 344 
 345 
 346 /*
 347  * Used by page layer to know about page sizes
 348  */
 349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 350 
 351 kmutex_t        *fpc_mutex[NPC_MUTEX];
 352 kmutex_t        *cpc_mutex[NPC_MUTEX];
 353 
 354 /* Lock to protect mnoderanges array for memory DR operations. */
 355 static kmutex_t mnoderange_lock;
 356 
 357 /*
 358  * Only let one thread at a time try to coalesce large pages, to
 359  * prevent them from working against each other.
 360  */
 361 static kmutex_t contig_lock;
 362 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
 363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 364 
 365 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 366 
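/*
 * Map pgcnt physical pages, starting at page frame pf, into the kernel heap
 * arena with the given protections and return the kernel virtual address of
 * the mapping.
 */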
 367 caddr_t
 368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
 369 {
 370         caddr_t addr;
 371         caddr_t addr1;
 372         page_t *pp;
 373 
 374         addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
 375 
 376         for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
 377                 pp = page_numtopp_nolock(pf);
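		/*
		 * pfns with no page_t (e.g. device memory) are mapped with
		 * hat_devload(); ordinary memory pages use hat_memload().
		 */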
 378                 if (pp == NULL) {
 379                         hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
 380                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 381                 } else {
 382                         hat_memload(kas.a_hat, addr, pp,
 383                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 384                 }
 385         }
 386 
 387         return (addr1);
 388 }
 389 
 390 /*
 391  * This routine is like page_numtopp, but accepts only free pages, which
 392  * it allocates (unfrees) and returns with the exclusive lock held.
 393  * It is used by machdep.c/dma_init() to find contiguous free pages.
 394  */
 395 page_t *
 396 page_numtopp_alloc(pfn_t pfnum)
 397 {
 398         page_t *pp;
 399 
 400 retry:
 401         pp = page_numtopp_nolock(pfnum);
 402         if (pp == NULL) {
 403                 return (NULL);
 404         }
 405 
 406         if (!page_trylock(pp, SE_EXCL)) {
 407                 return (NULL);
 408         }
 409 
 410         if (page_pptonum(pp) != pfnum) {
 411                 page_unlock(pp);
 412                 goto retry;
 413         }
 414 
 415         if (!PP_ISFREE(pp)) {
 416                 page_unlock(pp);
 417                 return (NULL);
 418         }
 419         if (pp->p_szc) {
 420                 page_demote_free_pages(pp);
 421                 page_unlock(pp);
 422                 goto retry;
 423         }
 424 
 425         /* If associated with a vnode, destroy mappings */
 426 
 427         if (pp->p_vnode) {
 428 
 429                 page_destroy_free(pp);
 430 
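		/*
		 * page_destroy_free() dropped our exclusive lock, so
		 * reacquire it (without reclaiming) and re-validate that
		 * the page still corresponds to this pfn.
		 */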
 431                 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 432                         return (NULL);
 433                 }
 434 
 435                 if (page_pptonum(pp) != pfnum) {
 436                         page_unlock(pp);
 437                         goto retry;
 438                 }
 439         }
 440 
 441         if (!PP_ISFREE(pp)) {
 442                 page_unlock(pp);
 443                 return (NULL);
 444         }
 445 
 446         if (!page_reclaim(pp, (kmutex_t *)NULL))
 447                 return (NULL);
 448 
 449         return (pp);
 450 }
 451 
 452 /*
 453  * Return the optimum page size for a given mapping
 454  */
 455 /*ARGSUSED*/
 456 size_t
 457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 458 {
 459         level_t l = 0;
 460         size_t pgsz = MMU_PAGESIZE;
 461         size_t max_lpsize;
 462         uint_t mszc;
 463 
 464         ASSERT(maptype != MAPPGSZ_VA);
 465 
 466         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 467                 return (MMU_PAGESIZE);
 468         }
 469 
 470         switch (maptype) {
 471         case MAPPGSZ_HEAP:
 472         case MAPPGSZ_STK:
 473                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 474                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 475                 if (max_lpsize == MMU_PAGESIZE) {
 476                         return (MMU_PAGESIZE);
 477                 }
 478                 if (len == 0) {
 479                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 480                             p->p_brksize - p->p_bssbase : p->p_stksize;
 481                 }
 482                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 483                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 484 
		/*
		 * use the page size that best fits len
		 */
 488                 for (l = mmu.umax_page_level; l > 0; --l) {
 489                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 490                                 continue;
 491                         } else {
 492                                 pgsz = LEVEL_SIZE(l);
 493                         }
 494                         break;
 495                 }
 496 
 497                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 498                     p->p_stkpageszc);
 499                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 500                         pgsz = hw_page_array[mszc].hp_size;
 501                 }
 502                 return (pgsz);
 503 
 504         case MAPPGSZ_ISM:
 505                 for (l = mmu.umax_page_level; l > 0; --l) {
 506                         if (len >= LEVEL_SIZE(l))
 507                                 return (LEVEL_SIZE(l));
 508                 }
 509                 return (LEVEL_SIZE(0));
 510         }
 511         return (pgsz);
 512 }
 513 
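/*
 * Helper for map_pgszcvec(): return a bit vector of page size codes that can
 * be used to map [addr, addr + size), given the mapping offset, the largest
 * large page size allowed, and the minimum amount of physical memory below
 * which large pages are not used at all.
 */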
 514 static uint_t
 515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 516     size_t min_physmem)
 517 {
 518         caddr_t eaddr = addr + size;
 519         uint_t szcvec = 0;
 520         caddr_t raddr;
 521         caddr_t readdr;
 522         size_t  pgsz;
 523         int i;
 524 
 525         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 526                 return (0);
 527         }
 528 
 529         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 530                 pgsz = page_get_pagesize(i);
 531                 if (pgsz > max_lpsize) {
 532                         continue;
 533                 }
 534                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 535                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 536                 if (raddr < addr || raddr >= readdr) {
 537                         continue;
 538                 }
 539                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 540                         continue;
 541                 }
		/*
		 * Set szcvec to the remaining page sizes: bits 1 through i
		 * are set (every size code up to and including this one);
		 * bit 0 is cleared since the base page size is always
		 * implied.
		 */
 545                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 546                 break;
 547         }
 548         return (szcvec);
 549 }
 550 
 551 /*
 552  * Return a bit vector of large page size codes that
 553  * can be used to map [addr, addr + len) region.
 554  */
 555 /*ARGSUSED*/
 556 uint_t
 557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 558     int memcntl)
 559 {
 560         size_t max_lpsize = mcntl0_lpsize;
 561 
 562         if (mmu.max_page_level == 0)
 563                 return (0);
 564 
 565         if (flags & MAP_TEXT) {
 566                 if (!memcntl)
 567                         max_lpsize = max_utext_lpsize;
 568                 return (map_szcvec(addr, size, off, max_lpsize,
 569                     shm_lpg_min_physmem));
 570 
 571         } else if (flags & MAP_INITDATA) {
 572                 if (!memcntl)
 573                         max_lpsize = max_uidata_lpsize;
 574                 return (map_szcvec(addr, size, off, max_lpsize,
 575                     privm_lpg_min_physmem));
 576 
 577         } else if (type == MAPPGSZC_SHM) {
 578                 if (!memcntl)
 579                         max_lpsize = max_shm_lpsize;
 580                 return (map_szcvec(addr, size, off, max_lpsize,
 581                     shm_lpg_min_physmem));
 582 
 583         } else if (type == MAPPGSZC_HEAP) {
 584                 if (!memcntl)
 585                         max_lpsize = max_uheap_lpsize;
 586                 return (map_szcvec(addr, size, off, max_lpsize,
 587                     privm_lpg_min_physmem));
 588 
 589         } else if (type == MAPPGSZC_STACK) {
 590                 if (!memcntl)
 591                         max_lpsize = max_ustack_lpsize;
 592                 return (map_szcvec(addr, size, off, max_lpsize,
 593                     privm_lpg_min_physmem));
 594 
 595         } else {
 596                 if (!memcntl)
 597                         max_lpsize = max_privmap_lpsize;
 598                 return (map_szcvec(addr, size, off, max_lpsize,
 599                     privm_lpg_min_physmem));
 600         }
 601 }
 602 
 603 /*
 604  * Handle a pagefault.
 605  */
 606 faultcode_t
 607 pagefault(
 608         caddr_t addr,
 609         enum fault_type type,
 610         enum seg_rw rw,
 611         int iskernel)
 612 {
 613         struct as *as;
 614         struct hat *hat;
 615         struct proc *p;
 616         kthread_t *t;
 617         faultcode_t res;
 618         caddr_t base;
 619         size_t len;
 620         int err;
 621         int mapped_red;
 622         uintptr_t ea;
 623 
 624         ASSERT_STACK_ALIGNED();
 625 
 626         if (INVALID_VADDR(addr))
 627                 return (FC_NOMAP);
 628 
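	/*
	 * If the kernel stack is running low, map in the redzone page so
	 * there is enough stack left to handle this fault.
	 */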
 629         mapped_red = segkp_map_red();
 630 
 631         if (iskernel) {
 632                 as = &kas;
 633                 hat = as->a_hat;
 634         } else {
 635                 t = curthread;
 636                 p = ttoproc(t);
 637                 as = p->p_as;
 638                 hat = as->a_hat;
 639         }
 640 
 641         /*
 642          * Dispatch pagefault.
 643          */
 644         res = as_fault(hat, as, addr, 1, type, rw);
 645 
 646         /*
 647          * If this isn't a potential unmapped hole in the user's
 648          * UNIX data or stack segments, just return status info.
 649          */
 650         if (res != FC_NOMAP || iskernel)
 651                 goto out;
 652 
	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
 658         base = p->p_brkbase;
 659         len = p->p_brksize;
 660 
 661         if (addr < base || addr >= base + len) {          /* data seg? */
 662                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 663                 len = p->p_stksize;
 664                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 665                         /* not in either UNIX data or stack segments */
 666                         res = FC_NOMAP;
 667                         goto out;
 668                 }
 669         }
 670 
	/*
	 * The rest of this function implements 3.X, 4.X and 5.X compatibility.
	 * This code is probably not needed anymore.
	 */
 675         if (p->p_model == DATAMODEL_ILP32) {
 676 
 677                 /* expand the gap to the page boundaries on each side */
 678                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 679                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 680                 len = ea - (uintptr_t)base;
 681 
 682                 as_rangelock(as);
 683                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 684                     0) {
 685                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 686                         as_rangeunlock(as);
 687                         if (err) {
 688                                 res = FC_MAKE_ERR(err);
 689                                 goto out;
 690                         }
 691                 } else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
 697                         as_rangeunlock(as);
 698                 }
 699 
 700                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 701         }
 702 
 703 out:
 704         if (mapped_red)
 705                 segkp_unmap_red();
 706 
 707         return (res);
 708 }
 709 
 710 void
 711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 712 {
 713         struct proc *p = curproc;
 714         caddr_t userlimit = (flags & _MAP_LOW32) ?
 715             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 716 
 717         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 718 }
 719 
 720 /*ARGSUSED*/
 721 int
 722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 723 {
 724         return (0);
 725 }
 726 
/*
 * The maximum amount a randomized mapping will be slewed.  We should perhaps
 * arrange things so these tunables can be separate for mmap, mmapobj, and
 * ld.so.
 */
 732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 733 
 734 /*
 735  * map_addr_proc() is the routine called when the system is to
 736  * choose an address for the user.  We will pick an address
 737  * range which is the highest available below userlimit.
 738  *
 739  * Every mapping will have a redzone of a single page on either side of
 740  * the request. This is done to leave one page unmapped between segments.
 741  * This is not required, but it's useful for the user because if their
 742  * program strays across a segment boundary, it will catch a fault
 743  * immediately making debugging a little easier.  Currently the redzone
 744  * is mandatory.
 745  *
 746  * addrp is a value/result parameter.
 747  *      On input it is a hint from the user to be used in a completely
 748  *      machine dependent fashion.  We decide to completely ignore this hint.
 749  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 750  *      must be some "power of two" multiple of pagesize.
 751  *
 *      On output it is NULL if no address can be found in the current
 *      process's address space or else an address that is currently
 *      not mapped for len bytes with a page of red zone on either side.
 *
 *      vacalign is not needed on x86 (it's for virtually addressed caches)
 */
 758 /*ARGSUSED*/
 759 void
 760 map_addr_proc(
 761         caddr_t *addrp,
 762         size_t len,
 763         offset_t off,
 764         int vacalign,
 765         caddr_t userlimit,
 766         struct proc *p,
 767         uint_t flags)
 768 {
 769         struct as *as = p->p_as;
 770         caddr_t addr;
 771         caddr_t base;
 772         size_t slen;
 773         size_t align_amount;
 774 
 775         ASSERT32(userlimit == as->a_userlimit);
 776 
 777         base = p->p_brkbase;
 778 #if defined(__amd64)
 779         /*
 780          * XX64 Yes, this needs more work.
 781          */
 782         if (p->p_model == DATAMODEL_NATIVE) {
 783                 if (userlimit < as->a_userlimit) {
 784                         /*
 785                          * This happens when a program wants to map
 786                          * something in a range that's accessible to a
 787                          * program in a smaller address space.  For example,
 788                          * a 64-bit program calling mmap32(2) to guarantee
 789                          * that the returned address is below 4Gbytes.
 790                          */
 791                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 792 
 793                         if (userlimit > base)
 794                                 slen = userlimit - base;
 795                         else {
 796                                 *addrp = NULL;
 797                                 return;
 798                         }
 799                 } else {
 800                         /*
 801                          * XX64 This layout is probably wrong .. but in
 802                          * the event we make the amd64 address space look
 803                          * like sparcv9 i.e. with the stack -above- the
 804                          * heap, this bit of code might even be correct.
 805                          */
 806                         slen = p->p_usrstack - base -
 807                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 808                 }
 809         } else
 810 #endif
 811                 slen = userlimit - base;
 812 
 813         /* Make len be a multiple of PAGESIZE */
 814         len = (len + PAGEOFFSET) & PAGEMASK;
 815 
 816         /*
 817          * figure out what the alignment should be
 818          *
 819          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 820          */
 821         if (len <= ELF_386_MAXPGSZ) {
 822                 /*
 823                  * Align virtual addresses to ensure that ELF shared libraries
 824                  * are mapped with the appropriate alignment constraints by
 825                  * the run-time linker.
 826                  */
 827                 align_amount = ELF_386_MAXPGSZ;
 828         } else {
 829                 /*
 830                  * For 32-bit processes, only those which have specified
 831                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 832                  * Not doing so can potentially waste up to 1G of process
 833                  * address space.
 834                  */
 835                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 836                     mmu.umax_page_level;
 837 
 838                 while (lvl && len < LEVEL_SIZE(lvl))
 839                         --lvl;
 840 
 841                 align_amount = LEVEL_SIZE(lvl);
 842         }
 843         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 844                 align_amount = (uintptr_t)*addrp;
 845 
 846         ASSERT(ISP2(align_amount));
 847         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 848 
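	/*
	 * Keep only the phase of the file offset within the alignment; the
	 * virtual address we pick must share this phase so the mapping can
	 * use large pages.
	 */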
 849         off = off & (align_amount - 1);
 850 
 851         /*
 852          * Look for a large enough hole starting below userlimit.
 853          * After finding it, use the upper part.
 854          */
 855         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 856             PAGESIZE, off) == 0) {
 857                 caddr_t as_addr;
 858 
 859                 /*
 860                  * addr is the highest possible address to use since we have
 861                  * a PAGESIZE redzone at the beginning and end.
 862                  */
 863                 addr = base + slen - (PAGESIZE + len);
 864                 as_addr = addr;
 865                 /*
 866                  * Round address DOWN to the alignment amount and
 867                  * add the offset in.
 868                  * If addr is greater than as_addr, len would not be large
 869                  * enough to include the redzone, so we must adjust down
 870                  * by the alignment amount.
 871                  */
 872                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 873                 addr += (uintptr_t)off;
 874                 if (addr > as_addr) {
 875                         addr -= align_amount;
 876                 }
 877 
 878                 /*
 879                  * If randomization is requested, slew the allocation
 880                  * backwards, within the same gap, by a random amount.
 881                  */
 882                 if (flags & _MAP_RANDOMIZE) {
 883                         uint32_t slew;
 884 
 885                         (void) random_get_pseudo_bytes((uint8_t *)&slew,
 886                             sizeof (slew));
 887 
 888                         slew = slew % MIN(aslr_max_map_skew, (addr - base));
 889                         addr -= P2ALIGN(slew, align_amount);
 890                 }
 891 
 892                 ASSERT(addr > base);
 893                 ASSERT(addr + len < base + slen);
 894                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 895                     ((uintptr_t)(off)));
 896                 *addrp = addr;
 897         } else {
 898                 *addrp = NULL;  /* no more virtual space */
 899         }
 900 }
 901 
 902 int valid_va_range_aligned_wraparound;
 903 
 904 /*
 905  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 906  * addresses at least "minlen" long, where the base of the range is at "off"
 907  * phase from an "align" boundary and there is space for a "redzone"-sized
 908  * redzone on either side of the range.  On success, 1 is returned and *basep
 909  * and *lenp are adjusted to describe the acceptable range (including
 910  * the redzone).  On failure, 0 is returned.
 911  */
 912 /*ARGSUSED3*/
 913 int
 914 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 915     size_t align, size_t redzone, size_t off)
 916 {
 917         uintptr_t hi, lo;
 918         size_t tot_len;
 919 
 920         ASSERT(align == 0 ? off == 0 : off < align);
 921         ASSERT(ISP2(align));
 922         ASSERT(align == 0 || align >= PAGESIZE);
 923 
 924         lo = (uintptr_t)*basep;
 925         hi = lo + *lenp;
 926         tot_len = minlen + 2 * redzone; /* need at least this much space */
 927 
 928         /*
 929          * If hi rolled over the top, try cutting back.
 930          */
 931         if (hi < lo) {
 932                 *lenp = 0UL - lo - 1UL;
 933                 /* See if this really happens. If so, then we figure out why */
 934                 valid_va_range_aligned_wraparound++;
 935                 hi = lo + *lenp;
 936         }
 937         if (*lenp < tot_len) {
 938                 return (0);
 939         }
 940 
 941 #if defined(__amd64)
 942         /*
 943          * Deal with a possible hole in the address range between
 944          * hole_start and hole_end that should never be mapped.
 945          */
 946         if (lo < hole_start) {
 947                 if (hi > hole_start) {
 948                         if (hi < hole_end) {
 949                                 hi = hole_start;
 950                         } else {
 951                                 /* lo < hole_start && hi >= hole_end */
 952                                 if (dir == AH_LO) {
 953                                         /*
 954                                          * prefer lowest range
 955                                          */
 956                                         if (hole_start - lo >= tot_len)
 957                                                 hi = hole_start;
 958                                         else if (hi - hole_end >= tot_len)
 959                                                 lo = hole_end;
 960                                         else
 961                                                 return (0);
 962                                 } else {
 963                                         /*
 964                                          * prefer highest range
 965                                          */
 966                                         if (hi - hole_end >= tot_len)
 967                                                 lo = hole_end;
 968                                         else if (hole_start - lo >= tot_len)
 969                                                 hi = hole_start;
 970                                         else
 971                                                 return (0);
 972                                 }
 973                         }
 974                 }
 975         } else {
 976                 /* lo >= hole_start */
 977                 if (hi < hole_end)
 978                         return (0);
 979                 if (lo < hole_end)
 980                         lo = hole_end;
 981         }
 982 #endif
 983 
 984         if (hi - lo < tot_len)
 985                 return (0);
 986 
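	/*
	 * Finally, check that an "align"-aligned region at phase "off" that
	 * is at least minlen bytes long still fits between the redzones.
	 */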
 987         if (align > 1) {
 988                 uintptr_t tlo = lo + redzone;
 989                 uintptr_t thi = hi - redzone;
 990                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 991                 if (tlo < lo + redzone) {
 992                         return (0);
 993                 }
 994                 if (thi < tlo || thi - tlo < minlen) {
 995                         return (0);
 996                 }
 997         }
 998 
 999         *basep = (caddr_t)lo;
1000         *lenp = hi - lo;
1001         return (1);
1002 }
1003 
1004 /*
1005  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1006  * addresses at least "minlen" long.  On success, 1 is returned and *basep
1007  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1008  * is returned.
1009  */
1010 int
1011 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1012 {
1013         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1014 }
1015 
1016 /*
1017  * Default to forbidding the first 64k of address space.  This protects most
1018  * reasonably sized structures from dereferences through NULL:
1019  *     ((foo_t *)0)->bar
1020  */
1021 uintptr_t forbidden_null_mapping_sz = 0x10000;
1022 
1023 /*
1024  * Determine whether [addr, addr+len] are valid user addresses.
1025  */
1026 /*ARGSUSED*/
1027 int
1028 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1029     caddr_t userlimit)
1030 {
1031         caddr_t eaddr = addr + len;
1032 
1033         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1034                 return (RANGE_BADADDR);
1035 
1036         if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1037             as->a_proc != NULL &&
1038             secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1039                 return (RANGE_BADADDR);
1040 
1041 #if defined(__amd64)
1042         /*
1043          * Check for the VA hole
1044          */
1045         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1046                 return (RANGE_BADADDR);
1047 #endif
1048 
1049         return (RANGE_OKAY);
1050 }
1051 
1052 /*
1053  * Return 1 if the page frame is onboard memory, else 0.
1054  */
1055 int
1056 pf_is_memory(pfn_t pf)
1057 {
1058         if (pfn_is_foreign(pf))
1059                 return (0);
1060         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1061 }
1062 
1063 /*
1064  * return the memrange containing pfn
1065  */
1066 int
1067 memrange_num(pfn_t pfn)
1068 {
1069         int n;
1070 
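	/*
	 * memranges[] is sorted by descending base pfn, so the first entry
	 * whose base is <= pfn identifies the range containing it.
	 */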
1071         for (n = 0; n < nranges - 1; ++n) {
1072                 if (pfn >= memranges[n])
1073                         break;
1074         }
1075         return (n);
1076 }
1077 
1078 /*
1079  * return the mnoderange containing pfn
1080  */
1081 /*ARGSUSED*/
1082 int
1083 pfn_2_mtype(pfn_t pfn)
1084 {
1085 #if defined(__xpv)
1086         return (0);
1087 #else
1088         int     n;
1089 
1090         /* Always start from highest pfn and work our way down */
1091         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1092                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
1093                         break;
1094                 }
1095         }
1096         return (n);
1097 #endif
1098 }
1099 
1100 #if !defined(__xpv)
1101 /*
1102  * is_contigpage_free:
1103  *      returns a page list of contiguous pages. It minimally has to return
1104  *      minctg pages. Caller determines minctg based on the scatter-gather
1105  *      list length.
1106  *
1107  *      pfnp is set to the next page frame to search on return.
1108  */
1109 static page_t *
1110 is_contigpage_free(
1111         pfn_t *pfnp,
1112         pgcnt_t *pgcnt,
1113         pgcnt_t minctg,
1114         uint64_t pfnseg,
1115         int iolock)
1116 {
1117         int     i = 0;
1118         pfn_t   pfn = *pfnp;
1119         page_t  *pp;
1120         page_t  *plist = NULL;
1121 
1122         /*
1123          * fail if pfn + minctg crosses a segment boundary.
1124          * Adjust for next starting pfn to begin at segment boundary.
1125          */
1126 
1127         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1128                 *pfnp = roundup(*pfnp, pfnseg + 1);
1129                 return (NULL);
1130         }
1131 
1132         do {
1133 retry:
1134                 pp = page_numtopp_nolock(pfn + i);
1135                 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1136                     (page_trylock(pp, SE_EXCL) == 0)) {
1137                         (*pfnp)++;
1138                         break;
1139                 }
1140                 if (page_pptonum(pp) != pfn + i) {
1141                         page_unlock(pp);
1142                         goto retry;
1143                 }
1144 
1145                 if (!(PP_ISFREE(pp))) {
1146                         page_unlock(pp);
1147                         (*pfnp)++;
1148                         break;
1149                 }
1150 
1151                 if (!PP_ISAGED(pp)) {
1152                         page_list_sub(pp, PG_CACHE_LIST);
1153                         page_hashout(pp, (kmutex_t *)NULL);
1154                 } else {
1155                         page_list_sub(pp, PG_FREE_LIST);
1156                 }
1157 
1158                 if (iolock)
1159                         page_io_lock(pp);
1160                 page_list_concat(&plist, &pp);
1161 
1162                 /*
1163                  * exit loop when pgcnt satisfied or segment boundary reached.
1164                  */
1165 
1166         } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1167 
1168         *pfnp += i;             /* set to next pfn to search */
1169 
1170         if (i >= minctg) {
1171                 *pgcnt -= i;
1172                 return (plist);
1173         }
1174 
1175         /*
1176          * failure: minctg not satisfied.
1177          *
1178          * if next request crosses segment boundary, set next pfn
1179          * to search from the segment boundary.
1180          */
1181         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1182                 *pfnp = roundup(*pfnp, pfnseg + 1);
1183 
1184         /* clean up any pages already allocated */
1185 
1186         while (plist) {
1187                 pp = plist;
1188                 page_sub(&plist, pp);
1189                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1190                 if (iolock)
1191                         page_io_unlock(pp);
1192                 page_unlock(pp);
1193         }
1194 
1195         return (NULL);
1196 }
1197 #endif  /* !__xpv */
1198 
/*
 * verify that pages being returned from the allocator have the correct
 * DMA attributes
 */
1202 #ifndef DEBUG
1203 #define check_dma(a, b, c) (void)(0)
1204 #else
1205 static void
1206 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1207 {
1208         if (dma_attr == NULL)
1209                 return;
1210 
1211         while (cnt-- > 0) {
1212                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1213                     dma_attr->dma_attr_addr_lo)
1214                         panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1215                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1216                     dma_attr->dma_attr_addr_hi)
1217                         panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1218                 pp = pp->p_next;
1219         }
1220 }
1221 #endif
1222 
1223 #if !defined(__xpv)
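/*
 * Gather *pgcnt pages, in at most dma_attr_sgllen physically contiguous
 * chunks, that satisfy the DMA attributes in mattr (address range, segment
 * boundary and alignment).  The search is serialized by contig_lock and
 * remembers where it left off so low memory is not rescanned on every call.
 */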
1224 static page_t *
1225 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1226 {
1227         pfn_t           pfn;
1228         int             sgllen;
1229         uint64_t        pfnseg;
1230         pgcnt_t         minctg;
1231         page_t          *pplist = NULL, *plist;
1232         uint64_t        lo, hi;
1233         pgcnt_t         pfnalign = 0;
1234         static pfn_t    startpfn;
1235         static pgcnt_t  lastctgcnt;
1236         uintptr_t       align;
1237 
1238         CONTIG_LOCK();
1239 
1240         if (mattr) {
1241                 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1242                 hi = mmu_btop(mattr->dma_attr_addr_hi);
1243                 if (hi >= physmax)
1244                         hi = physmax - 1;
1245                 sgllen = mattr->dma_attr_sgllen;
1246                 pfnseg = mmu_btop(mattr->dma_attr_seg);
1247 
1248                 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1249                 if (align > MMU_PAGESIZE)
1250                         pfnalign = mmu_btop(align);
1251 
1252                 /*
1253                  * in order to satisfy the request, must minimally
1254                  * acquire minctg contiguous pages
1255                  */
1256                 minctg = howmany(*pgcnt, sgllen);
1257 
1258                 ASSERT(hi >= lo);
1259 
1260                 /*
1261                  * start from where last searched if the minctg >= lastctgcnt
1262                  */
1263                 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1264                         startpfn = lo;
1265         } else {
1266                 hi = physmax - 1;
1267                 lo = 0;
1268                 sgllen = 1;
1269                 pfnseg = mmu.highest_pfn;
1270                 minctg = *pgcnt;
1271 
1272                 if (minctg < lastctgcnt)
1273                         startpfn = lo;
1274         }
1275         lastctgcnt = minctg;
1276 
1277         ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1278 
1279         /* conserve 16m memory - start search above 16m when possible */
1280         if (hi > PFN_16M && startpfn < PFN_16M)
1281                 startpfn = PFN_16M;
1282 
1283         pfn = startpfn;
1284         if (pfnalign)
1285                 pfn = P2ROUNDUP(pfn, pfnalign);
1286 
1287         while (pfn + minctg - 1 <= hi) {
1288 
1289                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1290                 if (plist) {
1291                         page_list_concat(&pplist, &plist);
1292                         sgllen--;
1293                         /*
1294                          * return when contig pages no longer needed
1295                          */
1296                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1297                                 startpfn = pfn;
1298                                 CONTIG_UNLOCK();
1299                                 check_dma(mattr, pplist, *pgcnt);
1300                                 return (pplist);
1301                         }
1302                         minctg = howmany(*pgcnt, sgllen);
1303                 }
1304                 if (pfnalign)
1305                         pfn = P2ROUNDUP(pfn, pfnalign);
1306         }
1307 
1308         /* cannot find contig pages in specified range */
1309         if (startpfn == lo) {
1310                 CONTIG_UNLOCK();
1311                 return (NULL);
1312         }
1313 
1314         /* did not start with lo previously */
1315         pfn = lo;
1316         if (pfnalign)
1317                 pfn = P2ROUNDUP(pfn, pfnalign);
1318 
1319         /* allow search to go above startpfn */
1320         while (pfn < startpfn) {
1321 
1322                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1323                 if (plist != NULL) {
1324 
1325                         page_list_concat(&pplist, &plist);
1326                         sgllen--;
1327 
1328                         /*
1329                          * return when contig pages no longer needed
1330                          */
1331                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1332                                 startpfn = pfn;
1333                                 CONTIG_UNLOCK();
1334                                 check_dma(mattr, pplist, *pgcnt);
1335                                 return (pplist);
1336                         }
1337                         minctg = howmany(*pgcnt, sgllen);
1338                 }
1339                 if (pfnalign)
1340                         pfn = P2ROUNDUP(pfn, pfnalign);
1341         }
1342         CONTIG_UNLOCK();
1343         return (NULL);
1344 }
1345 #endif  /* !__xpv */
1346 
1347 /*
1348  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1349  * memranges[]. Used to determine the size of page lists and mnoderanges.
1350  */
1351 int
1352 mnode_range_cnt(int mnode)
1353 {
1354 #if defined(__xpv)
1355         ASSERT(mnode == 0);
1356         return (1);
1357 #else   /* __xpv */
1358         int     mri;
1359         int     mnrcnt = 0;
1360 
1361         if (mem_node_config[mnode].exists != 0) {
1362                 mri = nranges - 1;
1363 
		/* find the memranges index containing the mnode range's base */
1365 
1366                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1367                         mri--;
1368 
1369                 /*
1370                  * increment mnode range counter when memranges or mnode
1371                  * boundary is reached.
1372                  */
1373                 while (mri >= 0 &&
1374                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1375                         mnrcnt++;
1376                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1377                                 mri--;
1378                         else
1379                                 break;
1380                 }
1381         }
1382         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1383         return (mnrcnt);
1384 #endif  /* __xpv */
1385 }
1386 
1387 /*
1388  * mnode_range_setup() initializes mnoderanges.
1389  */
1390 void
1391 mnode_range_setup(mnoderange_t *mnoderanges)
1392 {
1393         mnoderange_t *mp = mnoderanges;
1394         int     mnode, mri;
1395         int     mindex = 0;     /* current index into mnoderanges array */
1396         int     i, j;
1397         pfn_t   hipfn;
1398         int     last, hi;
1399 
1400         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1401                 if (mem_node_config[mnode].exists == 0)
1402                         continue;
1403 
1404                 mri = nranges - 1;
1405 
1406                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1407                         mri--;
1408 
1409                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1410                     MEMRANGELO(mri)) {
1411                         mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1412                             mem_node_config[mnode].physbase);
1413                         mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1414                             mem_node_config[mnode].physmax);
1415                         mnoderanges->mnr_mnode = mnode;
1416                         mnoderanges->mnr_memrange = mri;
1417                         mnoderanges->mnr_exists = 1;
1418                         mnoderanges++;
1419                         mindex++;
1420                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1421                                 mri--;
1422                         else
1423                                 break;
1424                 }
1425         }
1426 
	/*
	 * For now do a simple sort of the mnoderanges array to fill in
	 * the mnr_next fields.  Since mindex is expected to be relatively
	 * small, we use a simple O(N^2) algorithm.
	 */
1432         for (i = 0; i < mindex; i++) {
1433                 if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1434                         break;
1435         }
1436         ASSERT(i < mindex);
1437         last = i;
1438         mtype16m = last;
1439         mp[last].mnr_next = -1;
1440         for (i = 0; i < mindex - 1; i++) {
1441                 hipfn = (pfn_t)(-1);
1442                 hi = -1;
1443                 /* find next highest mnode range */
1444                 for (j = 0; j < mindex; j++) {
1445                         if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1446                             mp[j].mnr_pfnlo < hipfn) {
1447                                 hipfn = mp[j].mnr_pfnlo;
1448                                 hi = j;
1449                         }
1450                 }
1451                 mp[hi].mnr_next = last;
1452                 last = hi;
1453         }
1454         mtypetop = last;
1455 }
1456 
1457 #ifndef __xpv
1458 /*
1459  * Update mnoderanges for memory hot-add DR operations.
1460  */
1461 static void
1462 mnode_range_add(int mnode)
1463 {
1464         int     *prev;
1465         int     n, mri;
1466         pfn_t   start, end;
1467         extern  void membar_sync(void);
1468 
1469         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1470         ASSERT(mem_node_config[mnode].exists);
1471         start = mem_node_config[mnode].physbase;
1472         end = mem_node_config[mnode].physmax;
1473         ASSERT(start <= end);
1474         mutex_enter(&mnoderange_lock);
1475 
1476 #ifdef  DEBUG
1477         /* Check whether it interleaves with other memory nodes. */
1478         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1479                 ASSERT(mnoderanges[n].mnr_exists);
1480                 if (mnoderanges[n].mnr_mnode == mnode)
1481                         continue;
1482                 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1483                     end < mnoderanges[n].mnr_pfnlo);
1484         }
1485 #endif  /* DEBUG */
1486 
1487         mri = nranges - 1;
1488         while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1489                 mri--;
1490         while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1491                 /* Check whether mtype already exists. */
1492                 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1493                         if (mnoderanges[n].mnr_mnode == mnode &&
1494                             mnoderanges[n].mnr_memrange == mri) {
1495                                 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1496                                     start);
1497                                 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1498                                     end);
1499                                 break;
1500                         }
1501                 }
1502 
1503                 /* Add a new entry if it doesn't exist yet. */
1504                 if (n == -1) {
1505                         /* Try to find an unused entry in mnoderanges array. */
1506                         for (n = 0; n < mnoderangecnt; n++) {
1507                                 if (mnoderanges[n].mnr_exists == 0)
1508                                         break;
1509                         }
1510                         ASSERT(n < mnoderangecnt);
1511                         mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1512                         mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1513                         mnoderanges[n].mnr_mnode = mnode;
1514                         mnoderanges[n].mnr_memrange = mri;
1515                         mnoderanges[n].mnr_exists = 1;
			/*
			 * Insert into the list, which is sorted by descending
			 * mnr_pfnlo.  Page 0 should always be present, so the
			 * walk is guaranteed to terminate.
			 */
1517                         for (prev = &mtypetop;
1518                             mnoderanges[*prev].mnr_pfnlo > start;
1519                             prev = &mnoderanges[*prev].mnr_next) {
1520                                 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1521                                 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1522                         }
1523                         mnoderanges[n].mnr_next = *prev;
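                             /*
                              * Make sure the new entry's fields are globally
                              * visible before it is linked in, since lookups
                              * such as mtype_func() walk mnr_next without
                              * holding mnoderange_lock.
                              */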
1524                         membar_sync();
1525                         *prev = n;
1526                 }
1527 
1528                 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1529                         mri--;
1530                 else
1531                         break;
1532         }
1533 
1534         mutex_exit(&mnoderange_lock);
1535 }
1536 
1537 /*
1538  * Update mnoderanges for memory hot-removal DR operations.
1539  */
1540 static void
1541 mnode_range_del(int mnode)
1542 {
1543         _NOTE(ARGUNUSED(mnode));
1544         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1545         /* TODO: support deletion operation. */
1546         ASSERT(0);
1547 }
1548 
1549 void
1550 plat_slice_add(pfn_t start, pfn_t end)
1551 {
1552         mem_node_add_slice(start, end);
1553         if (plat_dr_enabled()) {
1554                 mnode_range_add(PFN_2_MEM_NODE(start));
1555         }
1556 }
1557 
1558 void
1559 plat_slice_del(pfn_t start, pfn_t end)
1560 {
1561         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1562         ASSERT(plat_dr_enabled());
1563         mnode_range_del(PFN_2_MEM_NODE(start));
1564         mem_node_del_slice(start, end);
1565 }
1566 #endif  /* __xpv */
1567 
1568 /*ARGSUSED*/
1569 int
1570 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1571 {
1572         int mtype = mtypetop;
1573 
1574 #if !defined(__xpv)
1575 #if defined(__i386)
1576         /*
1577          * set the mtype range
1578          * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1579          * - for non kmem requests, set range to above 4g if memory below 4g
1580          * runs low.
1581          */
1582         if (restricted_kmemalloc && VN_ISKAS(vp) &&
1583             (caddr_t)(vaddr) >= kernelheap &&
1584             (caddr_t)(vaddr) < ekernelheap) {
1585                 ASSERT(physmax4g);
1586                 mtype = mtype4g;
1587                 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1588                     btop(pgsz), *flags)) {
1589                         *flags |= PGI_MT_RANGE16M;
1590                 } else {
1591                         VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1592                         VM_STAT_COND_ADD((*flags & PG_PANIC),
1593                             vmm_vmstats.pgpanicalloc);
1594                         *flags |= PGI_MT_RANGE0;
1595                 }
1596                 return (mtype);
1597         }
1598 #endif  /* __i386 */
1599 
1600         if (RESTRICT4G_ALLOC) {
1601                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1602                 /* here only for > 4g systems */
1603                 *flags |= PGI_MT_RANGE4G;
1604         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1605                 *flags |= PGI_MT_RANGE16M;
1606         } else {
1607                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1608                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1609                 *flags |= PGI_MT_RANGE0;
1610         }
1611 #endif /* !__xpv */
1612         return (mtype);
1613 }
1614 
1615 
1616 /* mtype init for page_get_replacement_page */
1617 /*ARGSUSED*/
1618 int
1619 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1620 {
1621         int mtype = mtypetop;
1622 #if !defined(__xpv)
1623         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1624                 *flags |= PGI_MT_RANGE16M;
1625         } else {
1626                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1627                 *flags |= PGI_MT_RANGE0;
1628         }
1629 #endif
1630         return (mtype);
1631 }
1632 
1633 /*
1634  * Determine if the mnode range specified in mtype contains memory belonging
1635  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1636  * the range from high pfn to 0, 16m or 4g.
1637  *
1638  * Returns the first mnode range type index found, or -1 if none is found.
1639  */
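     /*
      * Typical usage (see mnode_pgcnt() below) starts from mtypetop and keeps
      * passing PGI_MT_NEXT until -1 is returned:
      *
      *         mtype = mtype_func(mnode, mtypetop, PGI_MT_RANGE0);
      *         while (mtype != -1) {
      *                 ... use mtype ...
      *                 mtype = mtype_func(mnode, mtype,
      *                     PGI_MT_RANGE0 | PGI_MT_NEXT);
      *         }
      */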
1640 int
1641 mtype_func(int mnode, int mtype, uint_t flags)
1642 {
1643         if (flags & PGI_MT_RANGE) {
1644                 int     mnr_lim = MRI_0;
1645 
1646                 if (flags & PGI_MT_NEXT) {
1647                         mtype = mnoderanges[mtype].mnr_next;
1648                 }
1649                 if (flags & PGI_MT_RANGE4G)
1650                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1651                 else if (flags & PGI_MT_RANGE16M)
1652                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1653                 while (mtype != -1 &&
1654                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1655                         if (mnoderanges[mtype].mnr_mnode == mnode)
1656                                 return (mtype);
1657                         mtype = mnoderanges[mtype].mnr_next;
1658                 }
1659         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1660                 return (mtype);
1661         }
1662         return (-1);
1663 }
1664 
1665 /*
1666  * Update the page list max counts with the pfn range specified by the
1667  * input parameters.
1668  */
1669 void
1670 mtype_modify_max(pfn_t startpfn, long cnt)
1671 {
1672         int             mtype;
1673         pgcnt_t         inc;
1674         spgcnt_t        scnt = (spgcnt_t)(cnt);
1675         pgcnt_t         acnt = ABS(scnt);
1676         pfn_t           endpfn = startpfn + acnt;
1677         pfn_t           pfn, lo;
1678 
1679         if (!physmax4g)
1680                 return;
1681 
1682         mtype = mtypetop;
1683         for (pfn = endpfn; pfn > startpfn; ) {
1684                 ASSERT(mtype != -1);
1685                 lo = mnoderanges[mtype].mnr_pfnlo;
1686                 if (pfn > lo) {
1687                         if (startpfn >= lo) {
1688                                 inc = pfn - startpfn;
1689                         } else {
1690                                 inc = pfn - lo;
1691                         }
1692                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1693                                 if (scnt > 0)
1694                                         maxmem4g += inc;
1695                                 else
1696                                         maxmem4g -= inc;
1697                         }
1698                         pfn -= inc;
1699                 }
1700                 mtype = mnoderanges[mtype].mnr_next;
1701         }
1702 }
1703 
1704 int
1705 mtype_2_mrange(int mtype)
1706 {
1707         return (mnoderanges[mtype].mnr_memrange);
1708 }
1709 
1710 void
1711 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1712 {
1713         _NOTE(ARGUNUSED(mnode));
1714         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1715         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1716         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1717 }
1718 
1719 size_t
1720 plcnt_sz(size_t ctrs_sz)
1721 {
1722 #ifdef DEBUG
1723         int     szc, colors;
1724 
1725         ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1726         for (szc = 0; szc < mmu_page_sizes; szc++) {
1727                 colors = page_get_pagecolors(szc);
1728                 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1729         }
1730 #endif
1731         return (ctrs_sz);
1732 }
1733 
1734 caddr_t
1735 plcnt_init(caddr_t addr)
1736 {
1737 #ifdef DEBUG
1738         int     mt, szc, colors;
1739 
1740         for (mt = 0; mt < mnoderangecnt; mt++) {
1741                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1742                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1743                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1744                         colors = page_get_pagecolors(szc);
1745                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1746                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1747                             (pgcnt_t *)addr;
1748                         addr += (sizeof (pgcnt_t) * colors);
1749                 }
1750         }
1751 #endif
1752         return (addr);
1753 }
1754 
1755 void
1756 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1757 {
1758         _NOTE(ARGUNUSED(pp));
1759 #ifdef DEBUG
1760         int     bin = PP_2_BIN(pp);
1761 
1762         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1763         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1764             cnt);
1765 #endif
1766         ASSERT(mtype == PP_2_MTYPE(pp));
1767         if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1768                 atomic_add_long(&freemem4g, cnt);
1769         if (flags & PG_CACHE_LIST)
1770                 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1771         else
1772                 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1773         atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1774 }
1775 
1776 /*
1777  * Returns the free page count for mnode
1778  */
1779 int
1780 mnode_pgcnt(int mnode)
1781 {
1782         int     mtype = mtypetop;
1783         int     flags = PGI_MT_RANGE0;
1784         pgcnt_t pgcnt = 0;
1785 
1786         mtype = mtype_func(mnode, mtype, flags);
1787 
1788         while (mtype != -1) {
1789                 pgcnt += MTYPE_FREEMEM(mtype);
1790                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1791         }
1792         return (pgcnt);
1793 }
1794 
1795 /*
1796  * Initialize page coloring variables based on the l2 cache parameters.
1797  * Calculate and return memory needed for page coloring data structures.
1798  */
1799 size_t
1800 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1801 {
1802         _NOTE(ARGUNUSED(l2_linesz));
1803         size_t  colorsz = 0;
1804         int     i;
1805         int     colors;
1806 
1807 #if defined(__xpv)
1808         /*
1809          * Hypervisor domains currently don't have any concept of NUMA.
1810          * Hence we'll act like there is only 1 memrange.
1811          */
1812         i = memrange_num(1);
1813 #else /* !__xpv */
1814         /*
1815          * Reduce the memory ranges lists if we don't have large amounts
1816          * of memory. This avoids searching known empty free lists.
1817          * To support memory DR operations, we need to keep memory ranges
1818          * for possible memory hot-add operations.
1819          */
1820         if (plat_dr_physmax > physmax)
1821                 i = memrange_num(plat_dr_physmax);
1822         else
1823                 i = memrange_num(physmax);
1824 #if defined(__i386)
1825         if (i > MRI_4G)
1826                 restricted_kmemalloc = 0;
1827 #endif
1828         /* physmax greater than 4g */
1829         if (i == MRI_4G)
1830                 physmax4g = 1;
1831 #endif /* !__xpv */
1832         memranges += i;
1833         nranges -= i;
1834 
1835         ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1836 
1837         ASSERT(ISP2(l2_linesz));
1838         ASSERT(l2_sz > MMU_PAGESIZE);
1839 
1840         /* l2_assoc is 0 for fully associative l2 cache */
1841         if (l2_assoc)
1842                 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1843         else
1844                 l2_colors = 1;
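             /*
              * For example, a 2MB 8-way L2 with 4KB pages yields
              * 2MB / (8 * 4KB) = 64 colors.
              */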
1845 
1846         ASSERT(ISP2(l2_colors));
1847 
1848         /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1849         page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1850 
1851         /*
1852          * cpu_page_colors is non-zero when a page color may be spread across
1853          * multiple bins.
1854          */
1855         if (l2_colors < page_colors)
1856                 cpu_page_colors = l2_colors;
1857 
1858         ASSERT(ISP2(page_colors));
1859 
1860         page_colors_mask = page_colors - 1;
1861 
1862         ASSERT(ISP2(CPUSETSIZE()));
1863         page_coloring_shift = lowbit(CPUSETSIZE());
1864 
1865         /* initialize number of colors per page size */
1866         for (i = 0; i <= mmu.max_page_level; i++) {
1867                 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1868                 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1869                 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1870                 hw_page_array[i].hp_colors = (page_colors_mask >>
1871                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1872                     + 1;
1873                 colorequivszc[i] = 0;
1874         }
1875 
1876         /*
1877          * The value of cpu_page_colors determines if additional color bins
1878          * need to be checked for a particular color in the page_get routines.
1879          */
1880         if (cpu_page_colors != 0) {
1881 
1882                 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1883                 ASSERT(a > 0);
1884                 ASSERT(a < 16);
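                     /*
                      * E.g. page_colors == 64 with cpu_page_colors == 16 gives
                      * a == 2; each hardware color is then spread over
                      * 2^2 == 4 bins.
                      */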
1885 
1886                 for (i = 0; i <= mmu.max_page_level; i++) {
1887                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1888                                 colorequivszc[i] = 0;
1889                                 continue;
1890                         }
1891                         while ((colors >> a) == 0)
1892                                 a--;
1893                         ASSERT(a >= 0);
1894 
1895                         /* higher 4 bits encode color equiv mask */
1896                         colorequivszc[i] = (a << 4);
1897                 }
1898         }
1899 
1900         /* factor in colorequiv to check additional 'equivalent' bins. */
1901         if (colorequiv > 1) {
1902 
1903                 int a = lowbit(colorequiv) - 1;
1904                 if (a > 15)
1905                         a = 15;
1906 
1907                 for (i = 0; i <= mmu.max_page_level; i++) {
1908                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1909                                 continue;
1910                         }
1911                         while ((colors >> a) == 0)
1912                                 a--;
1913                         if ((a << 4) > colorequivszc[i]) {
1914                                 colorequivszc[i] = (a << 4);
1915                         }
1916                 }
1917         }
1918 
1919         /* size for mnoderanges */
1920         for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1921                 mnoderangecnt += mnode_range_cnt(i);
1922         if (plat_dr_support_memory()) {
1923                 /*
1924                  * Reserve enough space for memory DR operations.
1925                  * Two extra mnoderanges for possible fragmentation,
1926                  * one for the 2G boundary and the other for the 4G boundary.
1927                  * We don't expect a memory board crossing the 16M boundary
1928                  * for memory hot-add operations on x86 platforms.
1929                  */
1930                 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1931         }
1932         colorsz = mnoderangecnt * sizeof (mnoderange_t);
1933 
1934         /* size for fpc_mutex and cpc_mutex */
1935         colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1936 
1937         /* size of page_freelists */
1938         colorsz += mnoderangecnt * sizeof (page_t ***);
1939         colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1940 
1941         for (i = 0; i < mmu_page_sizes; i++) {
1942                 colors = page_get_pagecolors(i);
1943                 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1944         }
1945 
1946         /* size of page_cachelists */
1947         colorsz += mnoderangecnt * sizeof (page_t **);
1948         colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1949 
1950         return (colorsz);
1951 }
1952 
1953 /*
1954  * Called once at startup to configure page_coloring data structures and
1955  * does the 1st page_free()/page_freelist_add().
1956  */
1957 void
1958 page_coloring_setup(caddr_t pcmemaddr)
1959 {
1960         int     i;
1961         int     j;
1962         int     k;
1963         caddr_t addr;
1964         int     colors;
1965 
1966         /*
1967          * do page coloring setup
1968          */
1969         addr = pcmemaddr;
1970 
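             /*
              * Carve up the buffer sized by page_coloring_init(): the
              * mnoderanges array, the fpc_mutex and cpc_mutex arrays, the
              * page_freelists and page_cachelists pointer arrays, and the
              * per-range color arrays.
              */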
1971         mnoderanges = (mnoderange_t *)addr;
1972         addr += (mnoderangecnt * sizeof (mnoderange_t));
1973 
1974         mnode_range_setup(mnoderanges);
1975 
1976         if (physmax4g)
1977                 mtype4g = pfn_2_mtype(0xfffff);
1978 
1979         for (k = 0; k < NPC_MUTEX; k++) {
1980                 fpc_mutex[k] = (kmutex_t *)addr;
1981                 addr += (max_mem_nodes * sizeof (kmutex_t));
1982         }
1983         for (k = 0; k < NPC_MUTEX; k++) {
1984                 cpc_mutex[k] = (kmutex_t *)addr;
1985                 addr += (max_mem_nodes * sizeof (kmutex_t));
1986         }
1987         page_freelists = (page_t ****)addr;
1988         addr += (mnoderangecnt * sizeof (page_t ***));
1989 
1990         page_cachelists = (page_t ***)addr;
1991         addr += (mnoderangecnt * sizeof (page_t **));
1992 
1993         for (i = 0; i < mnoderangecnt; i++) {
1994                 page_freelists[i] = (page_t ***)addr;
1995                 addr += (mmu_page_sizes * sizeof (page_t **));
1996 
1997                 for (j = 0; j < mmu_page_sizes; j++) {
1998                         colors = page_get_pagecolors(j);
1999                         page_freelists[i][j] = (page_t **)addr;
2000                         addr += (colors * sizeof (page_t *));
2001                 }
2002                 page_cachelists[i] = (page_t **)addr;
2003                 addr += (page_colors * sizeof (page_t *));
2004         }
2005 }
2006 
2007 #if defined(__xpv)
2008 /*
2009  * Give back 10% of the io_pool pages to the free list.
2010  * Don't shrink the pool below some absolute minimum.
2011  */
2012 static void
2013 page_io_pool_shrink()
2014 {
2015         int retcnt;
2016         page_t *pp, *pp_first, *pp_last, **curpool;
2017         mfn_t mfn;
2018         int bothpools = 0;
2019 
2020         mutex_enter(&io_pool_lock);
2021         io_pool_shrink_attempts++;      /* should be a kstat? */
2022         retcnt = io_pool_cnt / 10;
2023         if (io_pool_cnt - retcnt < io_pool_cnt_min)
2024                 retcnt = io_pool_cnt - io_pool_cnt_min;
2025         if (retcnt <= 0)
2026                 goto done;
2027         io_pool_shrinks++;      /* should be a kstat? */
2028         curpool = &io_pool_4g;
2029 domore:
2030         /*
2031          * Loop through, taking pages from the end of the list
2032          * (highest mfns), until the amount to return is reached.
2033          */
2034         for (pp = *curpool; pp && retcnt > 0; ) {
2035                 pp_first = pp_last = pp->p_prev;
2036                 if (pp_first == *curpool)
2037                         break;
2038                 retcnt--;
2039                 io_pool_cnt--;
2040                 page_io_pool_sub(curpool, pp_first, pp_last);
2041                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2042                         start_mfn = mfn;
2043                 page_free(pp_first, 1);
2044                 pp = *curpool;
2045         }
2046         if (retcnt != 0 && !bothpools) {
2047                 /*
2048                  * If not enough were found in the less constrained pool,
2049                  * try the more constrained one.
2050                  */
2051                 curpool = &io_pool_16m;
2052                 bothpools = 1;
2053                 goto domore;
2054         }
2055 done:
2056         mutex_exit(&io_pool_lock);
2057 }
2058 
2059 #endif  /* __xpv */
2060 
2061 uint_t
2062 page_create_update_flags_x86(uint_t flags)
2063 {
2064 #if defined(__xpv)
2065         /*
2066          * If this is an urgent (no-wait) allocation and free pages are
          * depleted, return some io_pool pages to the free list.
2067          */
2068         if (!(flags & PG_WAIT) && freemem < desfree)
2069                 page_io_pool_shrink();
2070 #else /* !__xpv */
2071         /*
2072          * page_create_get_something may call this because 4g memory may be
2073          * depleted. Set flags to allow for relocation of base page below
2074          * 4g if necessary.
2075          */
2076         if (physmax4g)
2077                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2078 #endif /* __xpv */
2079         return (flags);
2080 }
2081 
2082 /*ARGSUSED*/
2083 int
2084 bp_color(struct buf *bp)
2085 {
2086         return (0);
2087 }
2088 
2089 #if defined(__xpv)
2090 
2091 /*
2092  * Take pages out of an io_pool
2093  */
2094 static void
2095 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2096 {
2097         if (*poolp == pp_first) {
2098                 *poolp = pp_last->p_next;
2099                 if (*poolp == pp_first)
2100                         *poolp = NULL;
2101         }
2102         pp_first->p_prev->p_next = pp_last->p_next;
2103         pp_last->p_next->p_prev = pp_first->p_prev;
2104         pp_first->p_prev = pp_last;
2105         pp_last->p_next = pp_first;
2106 }
2107 
2108 /*
2109  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2110  */
2111 static void
2112 page_io_pool_add(page_t **poolp, page_t *pp)
2113 {
2114         page_t  *look;
2115         mfn_t   mfn = mfn_list[pp->p_pagenum];
2116 
2117         if (*poolp == NULL) {
2118                 *poolp = pp;
2119                 pp->p_next = pp;
2120                 pp->p_prev = pp;
2121                 return;
2122         }
2123 
2124         /*
2125          * Since we try to take pages from the high end of the pool,
2126          * chances are good that the pages to be put on the list will
2127          * go at or near the end of the list, so start at the end and
2128          * work backwards.
2129          */
2130         look = (*poolp)->p_prev;
2131         while (mfn < mfn_list[look->p_pagenum]) {
2132                 look = look->p_prev;
2133                 if (look == (*poolp)->p_prev)
2134                         break; /* backed all the way to front of list */
2135         }
2136 
2137         /* insert after look */
2138         pp->p_prev = look;
2139         pp->p_next = look->p_next;
2140         pp->p_next->p_prev = pp;
2141         look->p_next = pp;
2142         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2143                 /*
2144                  * We inserted a new first list element; adjust the pool
2145                  * pointer to the newly inserted element.
2146                  */
2147                 *poolp = pp;
2148         }
2149 }
2150 
2151 /*
2152  * Add a page to the io_pool.  Setting the force flag will force the page
2153  * into the io_pool no matter what.
2154  */
2155 static void
2156 add_page_to_pool(page_t *pp, int force)
2157 {
2158         page_t *highest;
2159         page_t *freep = NULL;
2160 
2161         mutex_enter(&io_pool_lock);
2162         /*
2163          * Always keep the scarce low memory pages
2164          */
2165         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2166                 ++io_pool_cnt;
2167                 page_io_pool_add(&io_pool_16m, pp);
2168                 goto done;
2169         }
2170         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2171                 ++io_pool_cnt;
2172                 page_io_pool_add(&io_pool_4g, pp);
2173         } else {
2174                 highest = io_pool_4g->p_prev;
2175                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2176                         page_io_pool_sub(&io_pool_4g, highest, highest);
2177                         page_io_pool_add(&io_pool_4g, pp);
2178                         freep = highest;
2179                 } else {
2180                         freep = pp;
2181                 }
2182         }
2183 done:
2184         mutex_exit(&io_pool_lock);
2185         if (freep)
2186                 page_free(freep, 1);
2187 }
2188 
2189 
2190 int contig_pfn_cnt;     /* no of pfns in the contig pfn list */
2191 int contig_pfn_max;     /* capacity of the contig pfn list */
2192 int next_alloc_pfn;     /* next position in list to start a contig search */
2193 int contig_pfnlist_updates;     /* pfn list update count */
2194 int contig_pfnlist_builds;      /* how many times have we (re)built list */
2195 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2196 int create_contig_pending;      /* nonzero means taskq creating contig list */
2197 pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2198 
2199 /*
2200  * Function to use in sorting a list of pfns by their underlying mfns.
2201  */
2202 static int
2203 mfn_compare(const void *pfnp1, const void *pfnp2)
2204 {
2205         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2206         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2207 
2208         if (mfn1 > mfn2)
2209                 return (1);
2210         if (mfn1 < mfn2)
2211                 return (-1);
2212         return (0);
2213 }
2214 
2215 /*
2216  * Compact the contig_pfn_list by tossing all the non-contiguous
2217  * elements from the list.
2218  */
2219 static void
2220 compact_contig_pfn_list(void)
2221 {
2222         pfn_t pfn, lapfn, prev_lapfn;
2223         mfn_t mfn;
2224         int i, newcnt = 0;
2225 
2226         prev_lapfn = 0;
2227         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2228                 pfn = contig_pfn_list[i];
2229                 lapfn = contig_pfn_list[i + 1];
2230                 mfn = mfn_list[pfn];
2231                 /*
2232                  * See if next pfn is for a contig mfn
2233                  */
2234                 if (mfn_list[lapfn] != mfn + 1)
2235                         continue;
2236                 /*
2237                  * pfn and lookahead are both put in list
2238                  * unless pfn is the previous lookahead.
2239                  */
2240                 if (pfn != prev_lapfn)
2241                         contig_pfn_list[newcnt++] = pfn;
2242                 contig_pfn_list[newcnt++] = lapfn;
2243                 prev_lapfn = lapfn;
2244         }
2245         for (i = newcnt; i < contig_pfn_cnt; i++)
2246                 contig_pfn_list[i] = 0;
2247         contig_pfn_cnt = newcnt;
2248 }
2249 
2250 /*ARGSUSED*/
2251 static void
2252 call_create_contiglist(void *arg)
2253 {
2254         (void) create_contig_pfnlist(PG_WAIT);
2255 }
2256 
2257 /*
2258  * Create list of freelist pfns that have underlying
2259  * contiguous mfns.  The list is kept in ascending mfn order.
2260  * Returns 1 if the list was created, else 0.
2261  */
2262 static int
2263 create_contig_pfnlist(uint_t flags)
2264 {
2265         pfn_t pfn;
2266         page_t *pp;
2267         int ret = 1;
2268 
2269         mutex_enter(&contig_list_lock);
2270         if (contig_pfn_list != NULL)
2271                 goto out;
2272         contig_pfn_max = freemem + (freemem / 10);
2273         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2274             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2275         if (contig_pfn_list == NULL) {
2276                 /*
2277                  * If we could not create the contig list (because we
2278                  * could not sleep for memory), dispatch a taskq job that
2279                  * can sleep to get the memory.
2280                  */
2281                 if (!create_contig_pending) {
2282                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2283                             NULL, TQ_NOSLEEP) != NULL)
2284                                 create_contig_pending = 1;
2285                 }
2286                 contig_pfnlist_buildfailed++;   /* count list build failures */
2287                 ret = 0;
2288                 goto out;
2289         }
2290         create_contig_pending = 0;
2291         ASSERT(contig_pfn_cnt == 0);
2292         for (pfn = 0; pfn < mfn_count; pfn++) {
2293                 pp = page_numtopp_nolock(pfn);
2294                 if (pp == NULL || !PP_ISFREE(pp))
2295                         continue;
2296                 contig_pfn_list[contig_pfn_cnt] = pfn;
2297                 if (++contig_pfn_cnt == contig_pfn_max)
2298                         break;
2299         }
2300         /*
2301          * Sanity check the new list.
2302          */
2303         if (contig_pfn_cnt < 2) { /* no contig pfns */
2304                 contig_pfn_cnt = 0;
2305                 contig_pfnlist_buildfailed++;
2306                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2307                 contig_pfn_list = NULL;
2308                 contig_pfn_max = 0;
2309                 ret = 0;
2310                 goto out;
2311         }
2312         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2313         compact_contig_pfn_list();
2314         /*
2315          * Make sure next search of the newly created contiguous pfn
2316          * list starts at the beginning of the list.
2317          */
2318         next_alloc_pfn = 0;
2319         contig_pfnlist_builds++;        /* count list builds */
2320 out:
2321         mutex_exit(&contig_list_lock);
2322         return (ret);
2323 }
2324 
2325 
2326 /*
2327  * Toss the current contig pfnlist.  Someone is about to do a massive
2328  * update to pfn<->mfn mappings, so we destroy the list and hold the lock
2329  * until they are done with their update.
2330  */
2331 void
2332 clear_and_lock_contig_pfnlist()
2333 {
2334         pfn_t *listp = NULL;
2335         size_t listsize;
2336 
2337         mutex_enter(&contig_list_lock);
2338         if (contig_pfn_list != NULL) {
2339                 listp = contig_pfn_list;
2340                 listsize = contig_pfn_max * sizeof (pfn_t);
2341                 contig_pfn_list = NULL;
2342                 contig_pfn_max = contig_pfn_cnt = 0;
2343         }
2344         if (listp != NULL)
2345                 kmem_free(listp, listsize);
2346 }
2347 
2348 /*
2349  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2350  * it to be re-created.
2351  */
2352 void
2353 unlock_contig_pfnlist()
2354 {
2355         mutex_exit(&contig_list_lock);
2356 }
2357 
2358 /*
2359  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2360  */
2361 void
2362 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2363 {
2364         int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2365         pfn_t probe_pfn;
2366         mfn_t probe_mfn;
2367         int drop_lock = 0;
2368 
2369         if (mutex_owner(&contig_list_lock) != curthread) {
2370                 drop_lock = 1;
2371                 mutex_enter(&contig_list_lock);
2372         }
2373         if (contig_pfn_list == NULL)
2374                 goto done;
2375         contig_pfnlist_updates++;
2376         /*
2377          * Find the pfn in the current list.  Use a binary chop to locate it.
2378          */
2379         probe_hi = contig_pfn_cnt - 1;
2380         probe_lo = 0;
2381         probe_pos = (probe_hi + probe_lo) / 2;
2382         while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2383                 if (probe_pos == probe_lo) { /* pfn not in list */
2384                         probe_pos = -1;
2385                         break;
2386                 }
2387                 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2388                         probe_lo = probe_pos;
2389                 else
2390                         probe_hi = probe_pos;
2391                 probe_pos = (probe_hi + probe_lo) / 2;
2392         }
2393         if (probe_pos >= 0) {
2394                 /*
2395                  * Remove pfn from list and ensure next alloc
2396                  * position stays in bounds.
2397                  */
2398                 if (--contig_pfn_cnt <= next_alloc_pfn)
2399                         next_alloc_pfn = 0;
2400                 if (contig_pfn_cnt < 2) { /* no contig pfns */
2401                         contig_pfn_cnt = 0;
2402                         kmem_free(contig_pfn_list,
2403                             contig_pfn_max * sizeof (pfn_t));
2404                         contig_pfn_list = NULL;
2405                         contig_pfn_max = 0;
2406                         goto done;
2407                 }
2408                 ovbcopy(&contig_pfn_list[probe_pos + 1],
2409                     &contig_pfn_list[probe_pos],
2410                     (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2411         }
2412         if (newmfn == MFN_INVALID)
2413                 goto done;
2414         /*
2415          * Check if new mfn has adjacent mfns in the list
2416          */
2417         probe_hi = contig_pfn_cnt - 1;
2418         probe_lo = 0;
2419         insert_after = -2;
2420         do {
2421                 probe_pos = (probe_hi + probe_lo) / 2;
2422                 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2423                 if (newmfn == probe_mfn + 1)
2424                         insert_after = probe_pos;
2425                 else if (newmfn == probe_mfn - 1)
2426                         insert_after = probe_pos - 1;
2427                 if (probe_pos == probe_lo)
2428                         break;
2429                 if (probe_mfn <= newmfn)
2430                         probe_lo = probe_pos;
2431                 else
2432                         probe_hi = probe_pos;
2433         } while (insert_after == -2);
2434         /*
2435          * If there is space in the list and there are adjacent mfns,
2436          * insert the pfn into its proper place in the list.
2437          */
2438         if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2439                 insert_point = insert_after + 1;
2440                 ovbcopy(&contig_pfn_list[insert_point],
2441                     &contig_pfn_list[insert_point + 1],
2442                     (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2443                 contig_pfn_list[insert_point] = pfn;
2444                 contig_pfn_cnt++;
2445         }
2446 done:
2447         if (drop_lock)
2448                 mutex_exit(&contig_list_lock);
2449 }
2450 
2451 /*
2452  * Called to (re-)populate the io_pool from the free page lists.
2453  */
2454 long
2455 populate_io_pool(void)
2456 {
2457         pfn_t pfn;
2458         mfn_t mfn, max_mfn;
2459         page_t *pp;
2460 
2461         /*
2462          * Figure out the bounds of the pool on first invocation.
2463          * We use a percentage of memory for the io pool size;
2464          * we allow that to shrink, but not to less than a fixed minimum.
2465          */
2466         if (io_pool_cnt_max == 0) {
2467                 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
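                     /*
                      * (e.g. an io_pool_physmem_pct of 2 would start the pool
                      * at physmem / 50 pages)
                      */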
2468                 io_pool_cnt_lowater = io_pool_cnt_max;
2469                 /*
2470                  * This is the first time in populate_io_pool; grab a va to use
2471                  * when we need to allocate pages.
2472                  */
2473                 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2474         }
2475         /*
2476          * If we are out of pages in the pool, then grow the size of the pool
2477          */
2478         if (io_pool_cnt == 0) {
2479                 /*
2480                  * Grow the max size of the io pool by 5%, but never more than
2481                  * 25% of physical memory.
2482                  */
2483                 if (io_pool_cnt_max < physmem / 4)
2484                         io_pool_cnt_max += io_pool_cnt_max / 20;
2485         }
2486         io_pool_grows++;        /* should be a kstat? */
2487 
2488         /*
2489          * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2490          */
2491         (void) mfn_to_pfn(start_mfn);
2492         max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2493         for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2494                 pfn = mfn_to_pfn(mfn);
2495                 if (pfn & PFN_IS_FOREIGN_MFN)
2496                         continue;
2497                 /*
2498                  * try to allocate it from free pages
2499                  */
2500                 pp = page_numtopp_alloc(pfn);
2501                 if (pp == NULL)
2502                         continue;
2503                 PP_CLRFREE(pp);
2504                 add_page_to_pool(pp, 1);
2505                 if (io_pool_cnt >= io_pool_cnt_max)
2506                         break;
2507         }
2508 
2509         return (io_pool_cnt);
2510 }
2511 
2512 /*
2513  * Destroy a page that was being used for DMA I/O. It may or
2514  * may not actually go back to the io_pool.
2515  */
2516 void
2517 page_destroy_io(page_t *pp)
2518 {
2519         mfn_t mfn = mfn_list[pp->p_pagenum];
2520 
2521         /*
2522          * When the page was alloc'd a reservation was made; release it now.
2523          */
2524         page_unresv(1);
2525         /*
2526          * Unload translations, if any, then hash out the
2527          * page to erase its identity.
2528          */
2529         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2530         page_hashout(pp, NULL);
2531 
2532         /*
2533          * If the page came from the free lists, just put it back to them.
2534          * DomU pages always go on the free lists as well.
2535          */
2536         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2537                 page_free(pp, 1);
2538                 return;
2539         }
2540 
2541         add_page_to_pool(pp, 0);
2542 }
2543 
2544 
2545 long contig_searches;           /* count of times contig pages requested */
2546 long contig_search_restarts;    /* count of contig ranges tried */
2547 long contig_search_failed;      /* count of contig alloc failures */
2548 
2549 /*
2550  * Free partial page list
2551  */
2552 static void
2553 free_partial_list(page_t **pplist)
2554 {
2555         page_t *pp;
2556 
2557         while (*pplist != NULL) {
2558                 pp = *pplist;
2559                 page_io_pool_sub(pplist, pp, pp);
2560                 page_free(pp, 1);
2561         }
2562 }
2563 
2564 /*
2565  * Look thru the contiguous pfns that are not part of the io_pool for
2566  * contiguous free pages.  Return a list of the found pages or NULL.
2567  */
2568 page_t *
2569 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2570     pgcnt_t pfnalign)
2571 {
2572         page_t *pp, *plist = NULL;
2573         mfn_t mfn, prev_mfn, start_mfn;
2574         pfn_t pfn;
2575         int pages_needed, pages_requested;
2576         int search_start;
2577 
2578         /*
2579          * create the contig pfn list if not already done
2580          */
2581 retry:
2582         mutex_enter(&contig_list_lock);
2583         if (contig_pfn_list == NULL) {
2584                 mutex_exit(&contig_list_lock);
2585                 if (!create_contig_pfnlist(flags)) {
2586                         return (NULL);
2587                 }
2588                 goto retry;
2589         }
2590         contig_searches++;
2591         /*
2592          * Search contiguous pfn list for physically contiguous pages not in
2593          * the io_pool.  Start the search where the last search left off.
2594          */
2595         pages_requested = pages_needed = npages;
2596         search_start = next_alloc_pfn;
2597         start_mfn = prev_mfn = 0;
2598         while (pages_needed) {
2599                 pfn = contig_pfn_list[next_alloc_pfn];
2600                 mfn = pfn_to_mfn(pfn);
2601                 /*
2602                  * Check whether this mfn is the first one or contiguous
2603                  * with the previous one, whether the page corresponding to
2604                  * the mfn is free, and that the mfn range does not cross a
                 * segment boundary.
2605                  */
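                     /*
                      * The segment test relies on the mfns in a run increasing
                      * by one: (mfn & pfnseg) dropping below
                      * (start_mfn & pfnseg) means the low bits wrapped, i.e.
                      * the run crossed a dma_attr_seg boundary.
                      */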
2606                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2607                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2608                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2609                         PP_CLRFREE(pp);
2610                         page_io_pool_add(&plist, pp);
2611                         pages_needed--;
2612                         if (prev_mfn == 0) {
2613                                 if (pfnalign &&
2614                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2615                                         /*
2616                                          * not properly aligned
2617                                          */
2618                                         contig_search_restarts++;
2619                                         free_partial_list(&plist);
2620                                         pages_needed = pages_requested;
2621                                         start_mfn = prev_mfn = 0;
2622                                         goto skip;
2623                                 }
2624                                 start_mfn = mfn;
2625                         }
2626                         prev_mfn = mfn;
2627                 } else {
2628                         contig_search_restarts++;
2629                         free_partial_list(&plist);
2630                         pages_needed = pages_requested;
2631                         start_mfn = prev_mfn = 0;
2632                 }
2633 skip:
2634                 if (++next_alloc_pfn == contig_pfn_cnt)
2635                         next_alloc_pfn = 0;
2636                 if (next_alloc_pfn == search_start)
2637                         break; /* all pfns searched */
2638         }
2639         mutex_exit(&contig_list_lock);
2640         if (pages_needed) {
2641                 contig_search_failed++;
2642                 /*
2643                  * Failed to find enough contig pages.
2644                  * Free the partial page list.
2645                  */
2646                 free_partial_list(&plist);
2647         }
2648         return (plist);
2649 }
2650 
2651 /*
2652  * Search the reserved io pool pages for a page range with the
2653  * desired characteristics.
2654  */
2655 page_t *
2656 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2657 {
2658         page_t *pp_first, *pp_last;
2659         page_t *pp, **poolp;
2660         pgcnt_t nwanted, pfnalign;
2661         uint64_t pfnseg;
2662         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2663         int align, attempt = 0;
2664 
2665         if (minctg == 1)
2666                 contig = 0;
2667         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2668         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2669         pfnseg = mmu_btop(mattr->dma_attr_seg);
2670         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2671         if (align > MMU_PAGESIZE)
2672                 pfnalign = mmu_btop(align);
2673         else
2674                 pfnalign = 0;
2675 
2676 try_again:
2677         /*
2678          * See if we want pages for a legacy device
2679          */
2680         if (hi_mfn < PFN_16MEG)
2681                 poolp = &io_pool_16m;
2682         else
2683                 poolp = &io_pool_4g;
2684 try_smaller:
2685         /*
2686          * Take pages from I/O pool. We'll use pages from the highest
2687          * MFN range possible.
2688          */
2689         pp_first = pp_last = NULL;
2690         mutex_enter(&io_pool_lock);
2691         nwanted = minctg;
2692         for (pp = *poolp; pp && nwanted > 0; ) {
2693                 pp = pp->p_prev;
2694 
2695                 /*
2696                  * skip pages above allowable range
2697                  */
2698                 mfn = mfn_list[pp->p_pagenum];
2699                 if (hi_mfn < mfn)
2700                         goto skip;
2701 
2702                 /*
2703                  * stop at pages below allowable range
2704                  */
2705                 if (lo_mfn > mfn)
2706                         break;
2707 restart:
2708                 if (pp_last == NULL) {
2709                         /*
2710                          * Check alignment
2711                          */
2712                         tmfn = mfn - (minctg - 1);
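                             /*
                              * Pages are gathered from high mfn downwards, so
                              * tmfn is the mfn the run would start at if
                              * minctg pages ending at mfn are found.
                              */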
2713                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2714                                 goto skip; /* not properly aligned */
2715                         /*
2716                          * Check segment
2717                          */
2718                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2719                                 goto skip; /* crosses seg boundary */
2720                         /*
2721                          * Start building page list
2722                          */
2723                         pp_first = pp_last = pp;
2724                         nwanted--;
2725                 } else {
2726                         /*
2727                          * check physical contiguity if required
2728                          */
2729                         if (contig &&
2730                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2731                                 /*
2732                                  * not a contiguous page, restart list.
2733                                  */
2734                                 pp_last = NULL;
2735                                 nwanted = minctg;
2736                                 goto restart;
2737                         } else { /* add page to list */
2738                                 pp_first = pp;
2739                                 nwanted--;
2740                         }
2741                 }
2742 skip:
2743                 if (pp == *poolp)
2744                         break;
2745         }
2746 
2747         /*
2748          * If we didn't find memory, try the more constrained pool, then
2749          * sweep free pages into the DMA pool and try again.
2750          */
2751         if (nwanted != 0) {
2752                 mutex_exit(&io_pool_lock);
2753                 /*
2754                  * If we were looking in the less constrained pool and
2755                  * didn't find pages, try the more constrained pool.
2756                  */
2757                 if (poolp == &io_pool_4g) {
2758                         poolp = &io_pool_16m;
2759                         goto try_smaller;
2760                 }
2761                 kmem_reap();
2762                 if (++attempt < 4) {
2763                         /*
2764                          * Grab some more io_pool pages
2765                          */
2766                         (void) populate_io_pool();
2767                         goto try_again; /* go around and retry */
2768                 }
2769                 return (NULL);
2770         }
2771         /*
2772          * Found the pages, now snip them from the list
2773          */
2774         page_io_pool_sub(poolp, pp_first, pp_last);
2775         io_pool_cnt -= minctg;
2776         /*
2777          * reset low water mark
2778          */
2779         if (io_pool_cnt < io_pool_cnt_lowater)
2780                 io_pool_cnt_lowater = io_pool_cnt;
2781         mutex_exit(&io_pool_lock);
2782         return (pp_first);
2783 }
2784 
2785 page_t *
2786 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2787     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2788 {
2789         uint_t kflags;
2790         int order, extra, extpages, i, contig, nbits, extents;
2791         page_t *pp, *expp, *pp_first, **pplist = NULL;
2792         mfn_t *mfnlist = NULL;
2793 
2794         contig = flags & PG_PHYSCONTIG;
2795         if (minctg == 1)
2796                 contig = 0;
2797         flags &= ~PG_PHYSCONTIG;
2798         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2799         /*
2800          * The hypervisor will allocate extents; if we want contig
2801          * pages, the extent must be >= minctg.
2802          */
2803         if (contig) {
2804                 order = highbit(minctg) - 1;
2805                 if (minctg & ((1 << order) - 1))
2806                         order++;
2807                 extpages = 1 << order;
2808         } else {
2809                 order = 0;
2810                 extpages = minctg;
2811         }
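             /*
              * E.g. a contig request with minctg == 5 rounds up to order 3 and
              * extpages == 8; the 3 extra pages are reserved below and handed
              * back to the free list once the exchange succeeds.
              */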
2812         if (extpages > minctg) {
2813                 extra = extpages - minctg;
2814                 if (!page_resv(extra, kflags))
2815                         return (NULL);
2816         }
2817         pp_first = NULL;
2818         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2819         if (pplist == NULL)
2820                 goto balloon_fail;
2821         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2822         if (mfnlist == NULL)
2823                 goto balloon_fail;
2824         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2825         if (pp == NULL)
2826                 goto balloon_fail;
2827         pp_first = pp;
2828         if (extpages > minctg) {
2829                 /*
2830                  * fill out the rest of extent pages to swap
2831                  * with the hypervisor
2832                  */
2833                 for (i = 0; i < extra; i++) {
2834                         expp = page_create_va(vp,
2835                             (u_offset_t)(uintptr_t)io_pool_kva,
2836                             PAGESIZE, flags, &kvseg, io_pool_kva);
2837                         if (expp == NULL)
2838                                 goto balloon_fail;
2839                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2840                         page_io_unlock(expp);
2841                         page_hashout(expp, NULL);
2842                         page_io_lock(expp);
2843                         /*
2844                          * add page to end of list
2845                          */
2846                         expp->p_prev = pp_first->p_prev;
2847                         expp->p_next = pp_first;
2848                         expp->p_prev->p_next = expp;
2849                         pp_first->p_prev = expp;
2850                 }
2851 
2852         }
2853         for (i = 0; i < extpages; i++) {
2854                 pplist[i] = pp;
2855                 pp = pp->p_next;
2856         }
2857         nbits = highbit(mattr->dma_attr_addr_hi);
2858         extents = contig ? 1 : minctg;
2859         if (balloon_replace_pages(extents, pplist, nbits, order,
2860             mfnlist) != extents) {
2861                 if (ioalloc_dbg)
2862                         cmn_err(CE_NOTE, "request to hypervisor"
2863                             " for %d pages, maxaddr %" PRIx64 " failed",
2864                             extpages, mattr->dma_attr_addr_hi);
2865                 goto balloon_fail;
2866         }
2867 
2868         kmem_free(pplist, extpages * sizeof (page_t *));
2869         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2870         /*
2871          * Return any excess pages to free list
2872          */
2873         if (extpages > minctg) {
2874                 for (i = 0; i < extra; i++) {
2875                         pp = pp_first->p_prev;
2876                         page_sub(&pp_first, pp);
2877                         page_io_unlock(pp);
2878                         page_unresv(1);
2879                         page_free(pp, 1);
2880                 }
2881         }
2882         return (pp_first);
2883 balloon_fail:
2884         /*
2885          * Return pages to free list and return failure
2886          */
2887         while (pp_first != NULL) {
2888                 pp = pp_first;
2889                 page_sub(&pp_first, pp);
2890                 page_io_unlock(pp);
2891                 if (pp->p_vnode != NULL)
2892                         page_hashout(pp, NULL);
2893                 page_free(pp, 1);
2894         }
2895         if (pplist)
2896                 kmem_free(pplist, extpages * sizeof (page_t *));
2897         if (mfnlist)
2898                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2899         page_unresv(extpages - minctg);
2900         return (NULL);
2901 }
2902 
2903 static void
2904 return_partial_alloc(page_t *plist)
2905 {
2906         page_t *pp;
2907 
2908         while (plist != NULL) {
2909                 pp = plist;
2910                 page_sub(&plist, pp);
2911                 page_io_unlock(pp);
2912                 page_destroy_io(pp);
2913         }
2914 }
2915 
2916 static page_t *
2917 page_get_contigpages(
2918         struct vnode    *vp,
2919         u_offset_t      off,
2920         int             *npagesp,
2921         uint_t          flags,
2922         caddr_t         vaddr,
2923         ddi_dma_attr_t  *mattr)
2924 {
2925         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2926         page_t  *plist; /* list to return */
2927         page_t  *pp, *mcpl;
2928         int     contig, anyaddr, npages, getone = 0;
2929         mfn_t   lo_mfn;
2930         mfn_t   hi_mfn;
2931         pgcnt_t pfnalign = 0;
2932         int     align, sgllen;
2933         uint64_t pfnseg;
2934         pgcnt_t minctg;
2935 
2936         npages = *npagesp;
2937         ASSERT(mattr != NULL);
2938         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2939         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2940         sgllen = mattr->dma_attr_sgllen;
2941         pfnseg = mmu_btop(mattr->dma_attr_seg);
2942         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2943         if (align > MMU_PAGESIZE)
2944                 pfnalign = mmu_btop(align);
2945 
2946         contig = flags & PG_PHYSCONTIG;
2947         if (npages == -1) {
2948                 npages = 1;
2949                 pfnalign = 0;
2950         }
2951         /*
2952          * Clear the contig flag if only one page is needed.
2953          */
2954         if (npages == 1) {
2955                 getone = 1;
2956                 contig = 0;
2957         }
2958 
2959         /*
2960          * Check if any page in the system is fine.
2961          */
2962         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2963         if (!contig && anyaddr && !pfnalign) {
2964                 flags &= ~PG_PHYSCONTIG;
2965                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2966                     flags, &kvseg, vaddr);
2967                 if (plist != NULL) {
2968                         *npagesp = 0;
2969                         return (plist);
2970                 }
2971         }
2972         plist = NULL;
2973         minctg = howmany(npages, sgllen);
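             /*
              * E.g. npages == 8 with sgllen == 3 gives minctg == 3: at most
              * sgllen runs of minctg contiguous pages are gathered, with the
              * last run clamped to whatever remains.
              */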
2974         while (npages > sgllen || getone) {
2975                 if (minctg > npages)
2976                         minctg = npages;
2977                 mcpl = NULL;
2978                 /*
2979                  * We could want contig pages with no address range limits.
2980                  */
2981                 if (anyaddr && contig) {
2982                         /*
2983                          * Look for free contig pages to satisfy the request.
2984                          */
2985                         mcpl = find_contig_free(minctg, flags, pfnseg,
2986                             pfnalign);
2987                 }
2988                 /*
2989                  * Try the reserved io pools next
2990                  */
2991                 if (mcpl == NULL)
2992                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2993                 if (mcpl != NULL) {
2994                         pp = mcpl;
2995                         do {
2996                                 if (!page_hashin(pp, vp, off, NULL)) {
2997                                         panic("page_get_contigpages:"
2998                                             " hashin failed"
2999                                             " pp %p, vp %p, off %llx",
3000                                             (void *)pp, (void *)vp, off);
3001                                 }
3002                                 off += MMU_PAGESIZE;
3003                                 PP_CLRFREE(pp);
3004                                 PP_CLRAGED(pp);
3005                                 page_set_props(pp, P_REF);
3006                                 page_io_lock(pp);
3007                                 pp = pp->p_next;
3008                         } while (pp != mcpl);
3009                 } else {
3010                         /*
3011                          * Hypervisor exchange doesn't handle segment or
3012                          * alignment constraints
3013                          */
3014                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3015                             pfnalign)
3016                                 goto fail;
3017                         /*
3018                          * Try exchanging pages with the hypervisor
3019                          */
3020                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3021                             flags, minctg);
3022                         if (mcpl == NULL)
3023                                 goto fail;
3024                         off += minctg * MMU_PAGESIZE;
3025                 }
3026                 check_dma(mattr, mcpl, minctg);
3027                 /*
3028                  * We now have a run of minctg contiguous pages; add them to
3029                  * the list we will return for this request.
3030                  */
3031                 page_list_concat(&plist, &mcpl);
3032                 npages -= minctg;
3033                 *npagesp = npages;
3034                 sgllen--;
3035                 if (getone)
3036                         break;
3037         }
3038         return (plist);
3039 fail:
3040         return_partial_alloc(plist);
3041         return (NULL);
3042 }
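
     /*
      * Illustrative sketch (comment only, not compiled) of a hypothetical
      * dom0 caller of page_get_contigpages().  The page count is passed by
      * reference and is decremented as contiguous runs are gathered, so a
      * non-zero value on return means that many pages still have to be
      * obtained elsewhere.  my_vp, my_off, my_vaddr and my_attr are
      * placeholder names, not real symbols.
      *
      *	int npages = 8;
      *	page_t *plist = page_get_contigpages(my_vp, my_off, &npages,
      *	    PG_EXCL | PG_PHYSCONTIG, my_vaddr, &my_attr);
      *
      * A non-NULL plist with npages != 0 means the remaining pages need not
      * be contiguous; page_create_io() below collects them one at a time.
      */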
3043 
3044 /*
3045  * Allocator for domain 0 I/O pages. We match the required
3046  * DMA attributes and contiguity constraints.
3047  */
3048 /*ARGSUSED*/
3049 page_t *
3050 page_create_io(
3051         struct vnode    *vp,
3052         u_offset_t      off,
3053         uint_t          bytes,
3054         uint_t          flags,
3055         struct as       *as,
3056         caddr_t         vaddr,
3057         ddi_dma_attr_t  *mattr)
3058 {
3059         page_t  *plist = NULL, *pp;
3060         int     npages = 0, contig, anyaddr, pages_req;
3061         mfn_t   lo_mfn;
3062         mfn_t   hi_mfn;
3063         pgcnt_t pfnalign = 0;
3064         int     align;
3065         int     is_domu = 0;
3066         int     dummy, bytes_got;
3067         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3068 
3069         ASSERT(mattr != NULL);
3070         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3071         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3072         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3073         if (align > MMU_PAGESIZE)
3074                 pfnalign = mmu_btop(align);
3075 
3076         /*
3077          * Clear the contig flag if only one page is needed or the scatter
3078          * gather list length is >= npages.
3079          */
3080         pages_req = npages = mmu_btopr(bytes);
3081         contig = (flags & PG_PHYSCONTIG);
3082         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3083         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3084                 contig = 0;
3085 
3086         /*
3087          * Check if any old page in the system is fine.
3088          * DomU should always go down this path.
3089          */
3090         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3091         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3092         if ((!contig && anyaddr) || is_domu) {
3093                 flags &= ~PG_PHYSCONTIG;
3094                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3095                 if (plist != NULL)
3096                         return (plist);
3097                 else if (is_domu)
3098                         return (NULL); /* no memory available */
3099         }
3100         /*
3101          * DomU should never reach here
3102          */
3103         if (contig) {
3104                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3105                     mattr);
3106                 if (plist == NULL)
3107                         goto fail;
3108                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3109                 vaddr += bytes_got;
3110                 off += bytes_got;
3111                 /*
3112                  * We now have all the contiguous pages we need, but
3113                  * we may still need additional non-contiguous pages.
3114                  */
3115         }
3116         /*
3117          * Now loop collecting the requested number of pages.  These do
3118          * not have to be contiguous, but we use the contig page alloc
3119          * code to get them since it will honor any other constraints
3120          * the pages may have.
3121          */
3122         while (npages--) {
3123                 dummy = -1;
3124                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3125                 if (pp == NULL)
3126                         goto fail;
3127                 page_add(&plist, pp);
3128                 vaddr += MMU_PAGESIZE;
3129                 off += MMU_PAGESIZE;
3130         }
3131         return (plist);
3132 fail:
3133         /*
3134          * Failed to get enough pages, return ones we did get
3135          */
3136         return_partial_alloc(plist);
3137         return (NULL);
3138 }
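
     /*
      * Illustrative sketch (comment only, not compiled): a hypothetical
      * dom0 driver path asking for four physically contiguous pages below
      * 4 GB.  The attribute values and the names my_off, my_vaddr and attr
      * are placeholders.
      *
      *	ddi_dma_attr_t attr = { ... dma_attr_addr_lo = 0,
      *	    dma_attr_addr_hi = 0xffffffffULL, dma_attr_sgllen = 1 ... };
      *	page_t *plist = page_create_io(&kvp, my_off, 4 * MMU_PAGESIZE,
      *	    PG_EXCL | PG_PHYSCONTIG, &kas, my_vaddr, &attr);
      *
      * In a domU the contiguity and DMA constraints are not honored; the
      * request is passed straight to page_create_va(), as the code above
      * notes.
      */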
3139 
3140 /*
3141  * Lock and return the page with the highest mfn that we can find.  last_mfn
3142  * holds the last one found, so the next search can start from there.  We
3143  * also keep a counter so that we don't loop forever if the machine has no
3144  * free pages.
3145  *
3146  * This is called from the balloon thread to find pages to give away.  new_high
3147  * is used when new mfns have been added to the system; we will reset our
3148  * search if the new mfns are higher than our current search position.
3149  */
3150 page_t *
3151 page_get_high_mfn(mfn_t new_high)
3152 {
3153         static mfn_t last_mfn = 0;
3154         pfn_t pfn;
3155         page_t *pp;
3156         ulong_t loop_count = 0;
3157 
3158         if (new_high > last_mfn)
3159                 last_mfn = new_high;
3160 
3161         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3162                 if (last_mfn == 0) {
3163                         last_mfn = cached_max_mfn;
3164                 }
3165 
3166                 pfn = mfn_to_pfn(last_mfn);
3167                 if (pfn & PFN_IS_FOREIGN_MFN)
3168                         continue;
3169 
3170                 /* See if the page is free.  If so, lock it. */
3171                 pp = page_numtopp_alloc(pfn);
3172                 if (pp == NULL)
3173                         continue;
3174                 PP_CLRFREE(pp);
3175 
3176                 ASSERT(PAGE_EXCL(pp));
3177                 ASSERT(pp->p_vnode == NULL);
3178                 ASSERT(!hat_page_is_mapped(pp));
3179                 last_mfn--;
3180                 return (pp);
3181         }
3182         return (NULL);
3183 }
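
     /*
      * Sketch (comment only, not compiled) of how the balloon thread might
      * drain high pages with this routine; balloon_give_page() is a made-up
      * name standing in for whatever hands the mfn back to the hypervisor.
      *
      *	page_t *pp;
      *	while (pages_needed > 0 && (pp = page_get_high_mfn(0)) != NULL) {
      *		balloon_give_page(pp);
      *		pages_needed--;
      *	}
      *
      * Passing 0 for new_high leaves the remembered search position alone;
      * a caller that has just learned of newly added, higher mfns passes
      * that value so the search restarts from the top.
      */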
3184 
3185 #else /* !__xpv */
3186 
3187 /*
3188  * get a page from any list with the given mnode
3189  */
3190 static page_t *
3191 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3192     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3193 {
3194         kmutex_t                *pcm;
3195         int                     i;
3196         page_t                  *pp;
3197         page_t                  *first_pp;
3198         uint64_t                pgaddr;
3199         ulong_t                 bin;
3200         int                     mtypestart;
3201         int                     plw_initialized;
3202         page_list_walker_t      plw;
3203 
3204         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3205 
3206         ASSERT((flags & PG_MATCH_COLOR) == 0);
3207         ASSERT(szc == 0);
3208         ASSERT(dma_attr != NULL);
3209 
3210         MTYPE_START(mnode, mtype, flags);
3211         if (mtype < 0) {
3212                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3213                 return (NULL);
3214         }
3215 
3216         mtypestart = mtype;
3217 
3218         bin = origbin;
3219 
3220         /*
3221          * check up to page_colors + 1 bins - origbin may be checked twice
3222          * because of BIN_STEP skip
3223          */
3224         do {
3225                 plw_initialized = 0;
3226 
3227                 for (plw.plw_count = 0;
3228                     plw.plw_count < page_colors; plw.plw_count++) {
3229 
3230                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3231                                 goto nextfreebin;
3232 
3233                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3234                         mutex_enter(pcm);
3235                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3236                         first_pp = pp;
3237                         while (pp != NULL) {
3238                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3239                                     SE_EXCL) == 0) {
3240                                         pp = pp->p_next;
3241                                         if (pp == first_pp) {
3242                                                 pp = NULL;
3243                                         }
3244                                         continue;
3245                                 }
3246 
3247                                 ASSERT(PP_ISFREE(pp));
3248                                 ASSERT(PP_ISAGED(pp));
3249                                 ASSERT(pp->p_vnode == NULL);
3250                                 ASSERT(pp->p_hash == NULL);
3251                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3252                                 ASSERT(pp->p_szc == szc);
3253                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3254                                 /* check if page within DMA attributes */
3255                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3256                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3257                                     (pgaddr + MMU_PAGESIZE - 1 <=
3258                                     dma_attr->dma_attr_addr_hi)) {
3259                                         break;
3260                                 }
3261 
3262                                 /* continue looking */
3263                                 page_unlock(pp);
3264                                 pp = pp->p_next;
3265                                 if (pp == first_pp)
3266                                         pp = NULL;
3267 
3268                         }
3269                         if (pp != NULL) {
3270                                 ASSERT(mtype == PP_2_MTYPE(pp));
3271                                 ASSERT(pp->p_szc == 0);
3272 
3273                                 /* found a page with specified DMA attributes */
3274                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3275                                     mtype), pp);
3276                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3277 
3278                                 if ((PP_ISFREE(pp) == 0) ||
3279                                     (PP_ISAGED(pp) == 0)) {
3280                                         cmn_err(CE_PANIC, "page %p is not free",
3281                                             (void *)pp);
3282                                 }
3283 
3284                                 mutex_exit(pcm);
3285                                 check_dma(dma_attr, pp, 1);
3286                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3287                                 return (pp);
3288                         }
3289                         mutex_exit(pcm);
3290 nextfreebin:
3291                         if (plw_initialized == 0) {
3292                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3293                                 ASSERT(plw.plw_ceq_dif == page_colors);
3294                                 plw_initialized = 1;
3295                         }
3296 
3297                         if (plw.plw_do_split) {
3298                                 pp = page_freelist_split(szc, bin, mnode,
3299                                     mtype,
3300                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3301                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3302                                     &plw);
3303                                 if (pp != NULL) {
3304                                         check_dma(dma_attr, pp, 1);
3305                                         return (pp);
3306                                 }
3307                         }
3308 
3309                         bin = page_list_walk_next_bin(szc, bin, &plw);
3310                 }
3311 
3312                 MTYPE_NEXT(mnode, mtype, flags);
3313         } while (mtype >= 0);
3314 
3315         /* failed to find a page in the freelist; try it in the cachelist */
3316 
3317         /* reset mtype start for cachelist search */
3318         mtype = mtypestart;
3319         ASSERT(mtype >= 0);
3320 
3321         /* start with the bin of matching color */
3322         bin = origbin;
3323 
3324         do {
3325                 for (i = 0; i <= page_colors; i++) {
3326                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3327                                 goto nextcachebin;
3328                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3329                         mutex_enter(pcm);
3330                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3331                         first_pp = pp;
3332                         while (pp != NULL) {
3333                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3334                                     SE_EXCL) == 0) {
3335                                         pp = pp->p_next;
3336                                         if (pp == first_pp)
3337                                                 pp = NULL;
3338                                         continue;
3339                                 }
3340                                 ASSERT(pp->p_vnode);
3341                                 ASSERT(PP_ISAGED(pp) == 0);
3342                                 ASSERT(pp->p_szc == 0);
3343                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3344 
3345                                 /* check if page within DMA attributes */
3346 
3347                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3348                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3349                                     (pgaddr + MMU_PAGESIZE - 1 <=
3350                                     dma_attr->dma_attr_addr_hi)) {
3351                                         break;
3352                                 }
3353 
3354                                 /* continue looking */
3355                                 page_unlock(pp);
3356                                 pp = pp->p_next;
3357                                 if (pp == first_pp)
3358                                         pp = NULL;
3359                         }
3360 
3361                         if (pp != NULL) {
3362                                 ASSERT(mtype == PP_2_MTYPE(pp));
3363                                 ASSERT(pp->p_szc == 0);
3364 
3365                                 /* found a page with specified DMA attributes */
3366                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3367                                     mtype), pp);
3368                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3369 
3370                                 mutex_exit(pcm);
3371                                 ASSERT(pp->p_vnode);
3372                                 ASSERT(PP_ISAGED(pp) == 0);
3373                                 check_dma(dma_attr, pp, 1);
3374                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3375                                 return (pp);
3376                         }
3377                         mutex_exit(pcm);
3378 nextcachebin:
3379                         bin += (i == 0) ? BIN_STEP : 1;
3380                         bin &= page_colors_mask;
3381                 }
3382                 MTYPE_NEXT(mnode, mtype, flags);
3383         } while (mtype >= 0);
3384 
3385         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3386         return (NULL);
3387 }
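
     /*
      * Worked example of the DMA window test above (numbers hypothetical,
      * 4 KB pages assumed): with dma_attr_addr_lo == 0 and dma_attr_addr_hi
      * == 0xffffff (a 24-bit ISA-style limit), a page whose machine address
      * is 0xfff000 qualifies, since 0xfff000 + MMU_PAGESIZE - 1 == 0xffffff,
      * while a page at 0x1000000 is rejected because its last byte,
      * 0x1000fff, lies beyond the limit.
      */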
3388 
3389 /*
3390  * This function is similar to page_get_freelist()/page_get_cachelist()
3391  * but it searches both lists to find a page with the specified
3392  * color (or no color) and DMA attributes. The search is done in the
3393  * freelist first and then in the cache list within the highest memory
3394  * range (based on DMA attributes) before searching in the lower
3395  * memory ranges.
3396  *
3397  * Note: This function is called only by page_create_io().
3398  */
3399 /*ARGSUSED*/
3400 static page_t *
3401 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3402     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3403 {
3404         uint_t          bin;
3405         int             mtype;
3406         page_t          *pp;
3407         int             n;
3408         int             m;
3409         int             szc;
3410         int             fullrange;
3411         int             mnode;
3412         int             local_failed_stat = 0;
3413         lgrp_mnode_cookie_t     lgrp_cookie;
3414 
3415         VM_STAT_ADD(pga_vmstats.pga_alloc);
3416 
3417         /* only base pagesize currently supported */
3418         if (size != MMU_PAGESIZE)
3419                 return (NULL);
3420 
3421         /*
3422          * If we're passed a specific lgroup, we use it.  Otherwise,
3423          * assume first-touch placement is desired.
3424          */
3425         if (!LGRP_EXISTS(lgrp))
3426                 lgrp = lgrp_home_lgrp();
3427 
3428         /* LINTED */
3429         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3430 
3431         /*
3432          * Only hold one freelist or cachelist lock at a time; that way we
3433          * can start anywhere and not have to worry about lock
3434          * ordering.
3435          */
3436         if (dma_attr == NULL) {
3437                 n = mtype16m;
3438                 m = mtypetop;
3439                 fullrange = 1;
3440                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3441         } else {
3442                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3443                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3444 
3445                 /*
3446                  * We can only guarantee alignment to a page boundary.
3447                  */
3448                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3449                         return (NULL);
3450 
3451                 /* Sanity check the dma_attr */
3452                 if (pfnlo > pfnhi)
3453                         return (NULL);
3454 
3455                 n = pfn_2_mtype(pfnlo);
3456                 m = pfn_2_mtype(pfnhi);
3457 
3458                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3459                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3460         }
3461         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3462 
3463         szc = 0;
3464 
3465         /* cycling through mtype handled by RANGE0 if n == mtype16m */
3466         if (n == mtype16m) {
3467                 flags |= PGI_MT_RANGE0;
3468                 n = m;
3469         }
3470 
3471         /*
3472          * Try local memory node first, but try remote if we can't
3473          * get a page of the right color.
3474          */
3475         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3476         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3477                 /*
3478                  * allocate pages from high pfn to low.
3479                  */
3480                 mtype = m;
3481                 do {
3482                         if (fullrange != 0) {
3483                                 pp = page_get_mnode_freelist(mnode,
3484                                     bin, mtype, szc, flags);
3485                                 if (pp == NULL) {
3486                                         pp = page_get_mnode_cachelist(
3487                                             bin, flags, mnode, mtype);
3488                                 }
3489                         } else {
3490                                 pp = page_get_mnode_anylist(bin, szc,
3491                                     flags, mnode, mtype, dma_attr);
3492                         }
3493                         if (pp != NULL) {
3494                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3495                                 check_dma(dma_attr, pp, 1);
3496                                 return (pp);
3497                         }
3498                 } while (mtype != n &&
3499                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3500                 if (!local_failed_stat) {
3501                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3502                         local_failed_stat = 1;
3503                 }
3504         }
3505         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3506 
3507         return (NULL);
3508 }
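
     /*
      * Sketch of the search order above, for a hypothetical device limited
      * to the low 4 GB: pfnlo/pfnhi map to mtypes n and m, and the inner
      * do/while walks mtype from m down to n via mnr_next, i.e. from the
      * highest usable memory range toward the lowest.  When the attribute
      * range exactly covers whole mtypes (fullrange), the ordinary freelist
      * and cachelist routines are used; otherwise each candidate page must
      * be filtered individually by page_get_mnode_anylist().
      */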
3509 
3510 /*
3511  * page_create_io()
3512  *
3513  * This function is a copy of page_create_va() with an additional
3514  * argument 'mattr' that specifies DMA memory requirements to
3515  * the page list functions. This function is used by the segkmem
3516  * allocator, so it is used only to create new pages (i.e., PG_EXCL
3517  * is set).
3518  *
3519  * Note: This interface is currently used by the x86 PSM only and is
3520  *       not fully specified, so the commitment level is only that of a
3521  *       private interface specific to x86. This interface uses the
3522  *       PSM-specific page_get_anylist() interface.
3523  */
3524 
3525 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3526         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3527                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3528                         break; \
3529         } \
3530 }
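
     /*
      * PAGE_HASH_SEARCH is intended to be used with the corresponding hash
      * chain mutex held, as in page_create_io() below:
      *
      *	phm = PAGE_HASH_MUTEX(index);
      *	mutex_enter(phm);
      *	PAGE_HASH_SEARCH(index, pp, vp, off);
      *	(pp is NULL afterwards if no page for (vp, off) is hashed in)
      */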
3531 
3532 
3533 page_t *
3534 page_create_io(
3535         struct vnode    *vp,
3536         u_offset_t      off,
3537         uint_t          bytes,
3538         uint_t          flags,
3539         struct as       *as,
3540         caddr_t         vaddr,
3541         ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3542 {
3543         page_t          *plist = NULL;
3544         uint_t          plist_len = 0;
3545         pgcnt_t         npages;
3546         page_t          *npp = NULL;
3547         uint_t          pages_req;
3548         page_t          *pp;
3549         kmutex_t        *phm = NULL;
3550         uint_t          index;
3551 
3552         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3553             "page_create_start:vp %p off %llx bytes %u flags %x",
3554             vp, off, bytes, flags);
3555 
3556         ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3557 
3558         pages_req = npages = mmu_btopr(bytes);
3559 
3560         /*
3561          * Do the freemem and pcf accounting.
3562          */
3563         if (!page_create_wait(npages, flags)) {
3564                 return (NULL);
3565         }
3566 
3567         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3568             "page_create_success:vp %p off %llx", vp, off);
3569 
3570         /*
3571          * If satisfying this request has left us with too little
3572          * memory, start the wheels turning to get some back.  The
3573          * first clause of the test prevents waking up the pageout
3574          * daemon in situations where it would decide that there's
3575          * nothing to do.
3576          */
3577         if (nscan < desscan && freemem < minfree) {
3578                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3579                     "pageout_cv_signal:freemem %ld", freemem);
3580                 cv_signal(&proc_pageout->p_cv);
3581         }
3582 
3583         if (flags & PG_PHYSCONTIG) {
3584 
3585                 plist = page_get_contigpage(&npages, mattr, 1);
3586                 if (plist == NULL) {
3587                         page_create_putback(npages);
3588                         return (NULL);
3589                 }
3590 
3591                 pp = plist;
3592 
3593                 do {
3594                         if (!page_hashin(pp, vp, off, NULL)) {
3595                                 panic("page_create_io: hashin failed %p %p %llx",
3596                                     (void *)pp, (void *)vp, off);
3597                         }
3598                         VM_STAT_ADD(page_create_new);
3599                         off += MMU_PAGESIZE;
3600                         PP_CLRFREE(pp);
3601                         PP_CLRAGED(pp);
3602                         page_set_props(pp, P_REF);
3603                         pp = pp->p_next;
3604                 } while (pp != plist);
3605 
3606                 if (!npages) {
3607                         check_dma(mattr, plist, pages_req);
3608                         return (plist);
3609                 } else {
3610                         vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3611                 }
3612 
3613                 /*
3614                  * fall-thru:
3615                  *
3616                  * page_get_contigpage returns when npages <= sgllen.
3617                  * Grab the rest of the non-contig pages below from anylist.
3618                  */
3619         }
3620 
3621         /*
3622          * Loop around collecting the requested number of pages.
3623          * Most of the time, we have to `create' a new page. With
3624          * this in mind, pull the page off the free list before
3625          * getting the hash lock.  This will minimize the hash
3626          * lock hold time, nesting, and the like.  If it turns
3627          * out we don't need the page, we put it back at the end.
3628          */
3629         while (npages--) {
3630                 phm = NULL;
3631 
3632                 index = PAGE_HASH_FUNC(vp, off);
3633 top:
3634                 ASSERT(phm == NULL);
3635                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3636                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3637 
3638                 if (npp == NULL) {
3639                         /*
3640                          * Try to get a page of any color either from
3641                          * the freelist or from the cache list.
3642                          */
3643                         npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3644                             flags & ~PG_MATCH_COLOR, mattr, NULL);
3645                         if (npp == NULL) {
3646                                 if (mattr == NULL) {
3647                                         /*
3648                                          * Not looking for a special page;
3649                                          * panic!
3650                                          */
3651                                         panic("no page found %d", (int)npages);
3652                                 }
3653                                 /*
3654                                  * No page found! This can happen
3655                                  * if we are looking for a page
3656                                  * within a specific memory range
3657                                  * for DMA purposes. If PG_WAIT is
3658                                  * specified then we wait for a
3659                                  * while and then try again. The
3660                                  * wait could be forever if we
3661                                  * don't get the page(s) we need.
3662                                  *
3663                                  * Note: XXX We really need a mechanism
3664                                  * to wait for pages in the desired
3665                                  * range. For now, we wait for any
3666                                  * pages and see if we can use them.
3667                                  */
3668 
3669                                 if ((mattr != NULL) && (flags & PG_WAIT)) {
3670                                         delay(10);
3671                                         goto top;
3672                                 }
3673                                 goto fail; /* undo accounting stuff */
3674                         }
3675 
3676                         if (PP_ISAGED(npp) == 0) {
3677                                 /*
3678                                  * Since this page came from the
3679                                  * cachelist, we must destroy the
3680                                  * old vnode association.
3681                                  */
3682                                 page_hashout(npp, (kmutex_t *)NULL);
3683                         }
3684                 }
3685 
3686                 /*
3687                  * We own this page!
3688                  */
3689                 ASSERT(PAGE_EXCL(npp));
3690                 ASSERT(npp->p_vnode == NULL);
3691                 ASSERT(!hat_page_is_mapped(npp));
3692                 PP_CLRFREE(npp);
3693                 PP_CLRAGED(npp);
3694 
3695                 /*
3696                  * Here we have a page in our hot little mitts and are
3697                  * just waiting to stuff it on the appropriate lists.
3698                  * Get the mutex and check to see if it really does
3699                  * not exist.
3700                  */
3701                 phm = PAGE_HASH_MUTEX(index);
3702                 mutex_enter(phm);
3703                 PAGE_HASH_SEARCH(index, pp, vp, off);
3704                 if (pp == NULL) {
3705                         VM_STAT_ADD(page_create_new);
3706                         pp = npp;
3707                         npp = NULL;
3708                         if (!page_hashin(pp, vp, off, phm)) {
3709                                 /*
3710                                  * Since we hold the page hash mutex and
3711                                  * just searched for this page, page_hashin
3712                                  * had better not fail.  If it does, that
3713                                  * means some thread did not follow the
3714                                  * page hash mutex rules.  Panic now and
3715                                  * get it over with.  As usual, go down
3716                                  * holding all the locks.
3717                                  */
3718                                 ASSERT(MUTEX_HELD(phm));
3719                                 panic("page_create: hashin fail %p %p %llx %p",
3720                                     (void *)pp, (void *)vp, off, (void *)phm);
3721 
3722                         }
3723                         ASSERT(MUTEX_HELD(phm));
3724                         mutex_exit(phm);
3725                         phm = NULL;
3726 
3727                         /*
3728                          * Hat layer locking need not be done to set
3729                          * the following bits since the page is not hashed
3730                          * and was on the free list (i.e., had no mappings).
3731                          *
3732                          * Set the reference bit to protect
3733                          * against immediate pageout
3734                          *
3735                          * XXXmh modify freelist code to set reference
3736                          * bit so we don't have to do it here.
3737                          */
3738                         page_set_props(pp, P_REF);
3739                 } else {
3740                         ASSERT(MUTEX_HELD(phm));
3741                         mutex_exit(phm);
3742                         phm = NULL;
3743                         /*
3744                          * NOTE: This should not happen for pages associated
3745                          *       with kernel vnode 'kvp'.
3746                          */
3747                         /* XX64 - to debug why this happens! */
3748                         ASSERT(!VN_ISKAS(vp));
3749                         if (VN_ISKAS(vp))
3750                                 cmn_err(CE_NOTE,
3751                                     "page_create: page not expected "
3752                                     "in hash list for kernel vnode - pp 0x%p",
3753                                     (void *)pp);
3754                         VM_STAT_ADD(page_create_exists);
3755                         goto fail;
3756                 }
3757 
3758                 /*
3759                  * Got a page!  It is locked.  Acquire the i/o
3760                  * lock since we are going to use the p_next and
3761                  * p_prev fields to link the requested pages together.
3762                  */
3763                 page_io_lock(pp);
3764                 page_add(&plist, pp);
3765                 plist = plist->p_next;
3766                 off += MMU_PAGESIZE;
3767                 vaddr += MMU_PAGESIZE;
3768         }
3769 
3770         check_dma(mattr, plist, pages_req);
3771         return (plist);
3772 
3773 fail:
3774         if (npp != NULL) {
3775                 /*
3776                  * Did not need this page after all.
3777                  * Put it back on the free list.
3778                  */
3779                 VM_STAT_ADD(page_create_putbacks);
3780                 PP_SETFREE(npp);
3781                 PP_SETAGED(npp);
3782                 npp->p_offset = (u_offset_t)-1;
3783                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3784                 page_unlock(npp);
3785         }
3786 
3787         /*
3788          * Give up the pages we already got.
3789          */
3790         while (plist != NULL) {
3791                 pp = plist;
3792                 page_sub(&plist, pp);
3793                 page_io_unlock(pp);
3794                 plist_len++;
3795                 /*LINTED: constant in conditional ctx*/
3796                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3797         }
3798 
3799         /*
3800          * VN_DISPOSE does freemem accounting for the pages in plist
3801          * by calling page_free. So, we need to undo the pcf accounting
3802          * for only the remaining pages.
3803          */
3804         VM_STAT_ADD(page_create_putbacks);
3805         page_create_putback(pages_req - plist_len);
3806 
3807         return (NULL);
3808 }
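
     /*
      * Illustrative sketch (comment only, not compiled): a hypothetical PSM
      * caller allocating four pages below 16 MB; my_off, my_vaddr and
      * my_attr are placeholders.
      *
      *	page_t *plist = page_create_io(&kvp, my_off, 4 * MMU_PAGESIZE,
      *	    PG_EXCL, &kas, my_vaddr, &my_attr);
      *
      * Without PG_WAIT, a constrained request that cannot be satisfied
      * returns NULL after the freemem/pcf accounting has been undone; with
      * PG_WAIT and a non-NULL mattr the routine keeps retrying, as noted in
      * the comments above.
      */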
3809 #endif /* !__xpv */
3810 
3811 
3812 /*
3813  * Copy the data from the physical page represented by "frompp" to
3814  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3815  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3816  * level and no one sleeps with an active mapping there.
3817  *
3818  * Note that the ref/mod bits in the page_t's are not affected by
3819  * this operation, hence it is up to the caller to update them appropriately.
3820  */
3821 int
3822 ppcopy(page_t *frompp, page_t *topp)
3823 {
3824         caddr_t         pp_addr1;
3825         caddr_t         pp_addr2;
3826         hat_mempte_t    pte1;
3827         hat_mempte_t    pte2;
3828         kmutex_t        *ppaddr_mutex;
3829         label_t         ljb;
3830         int             ret = 1;
3831 
3832         ASSERT_STACK_ALIGNED();
3833         ASSERT(PAGE_LOCKED(frompp));
3834         ASSERT(PAGE_LOCKED(topp));
3835 
3836         if (kpm_enable) {
3837                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3838                 pp_addr2 = hat_kpm_page2va(topp, 0);
3839                 kpreempt_disable();
3840         } else {
3841                 /*
3842                  * disable preemption so that the CPU can't change
3843                  */
3844                 kpreempt_disable();
3845 
3846                 pp_addr1 = CPU->cpu_caddr1;
3847                 pp_addr2 = CPU->cpu_caddr2;
3848                 pte1 = CPU->cpu_caddr1pte;
3849                 pte2 = CPU->cpu_caddr2pte;
3850 
3851                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3852                 mutex_enter(ppaddr_mutex);
3853 
3854                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3855                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3856                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3857                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3858                     HAT_LOAD_NOCONSIST);
3859         }
3860 
3861         if (on_fault(&ljb)) {
3862                 ret = 0;
3863                 goto faulted;
3864         }
3865         if (use_sse_pagecopy)
3866 #ifdef __xpv
3867                 page_copy_no_xmm(pp_addr2, pp_addr1);
3868 #else
3869                 hwblkpagecopy(pp_addr1, pp_addr2);
3870 #endif
3871         else
3872                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3873 
3874         no_fault();
3875 faulted:
3876         if (!kpm_enable) {
3877 #ifdef __xpv
3878                 /*
3879                  * We can't leave unused mappings lying around under the
3880                  * hypervisor, so blow them away.
3881                  */
3882                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3883                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3884                         panic("HYPERVISOR_update_va_mapping() failed");
3885                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3886                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3887                         panic("HYPERVISOR_update_va_mapping() failed");
3888 #endif
3889                 mutex_exit(ppaddr_mutex);
3890         }
3891         kpreempt_enable();
3892         return (ret);
3893 }
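
     /*
      * Minimal usage sketch (comment only): both pages must be locked (see
      * the PAGE_LOCKED asserts above), and a zero return means the copy
      * faulted, e.g. on an uncorrectable error in the source page.  srcpp
      * and dstpp are placeholder names.
      *
      *	if (ppcopy(srcpp, dstpp) == 0) {
      *		(handle the failed copy, e.g. by retiring srcpp)
      *	}
      */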
3894 
3895 void
3896 pagezero(page_t *pp, uint_t off, uint_t len)
3897 {
3898         ASSERT(PAGE_LOCKED(pp));
3899         pfnzero(page_pptonum(pp), off, len);
3900 }
3901 
3902 /*
3903  * Zero the physical page given by pfn, from off to off + len,
3904  * without changing the reference and modified bits of the page.
3905  *
3906  * We do this using CPU private page address #2; see ppcopy() for more info.
3907  * pfnzero() must not be called at interrupt level.
3908  */
3909 void
3910 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3911 {
3912         caddr_t         pp_addr2;
3913         hat_mempte_t    pte2;
3914         kmutex_t        *ppaddr_mutex = NULL;
3915 
3916         ASSERT_STACK_ALIGNED();
3917         ASSERT(len <= MMU_PAGESIZE);
3918         ASSERT(off <= MMU_PAGESIZE);
3919         ASSERT(off + len <= MMU_PAGESIZE);
3920 
3921         if (kpm_enable && !pfn_is_foreign(pfn)) {
3922                 pp_addr2 = hat_kpm_pfn2va(pfn);
3923                 kpreempt_disable();
3924         } else {
3925                 kpreempt_disable();
3926 
3927                 pp_addr2 = CPU->cpu_caddr2;
3928                 pte2 = CPU->cpu_caddr2pte;
3929 
3930                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3931                 mutex_enter(ppaddr_mutex);
3932 
3933                 hat_mempte_remap(pfn, pp_addr2, pte2,
3934                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3935                     HAT_LOAD_NOCONSIST);
3936         }
3937 
3938         if (use_sse_pagezero) {
3939 #ifdef __xpv
3940                 uint_t rem;
3941 
3942                 /*
3943                  * zero a byte at a time until properly aligned for
3944                  * block_zero_no_xmm().
3945                  */
3946                 while (P2PHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3947                         pp_addr2[off++] = 0;
3948 
3949                 /*
3950                  * Now use faster block_zero_no_xmm() for any range
3951                  * that is properly aligned and sized.
3952                  */
3953                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3954                 len -= rem;
3955                 if (len != 0) {
3956                         block_zero_no_xmm(pp_addr2 + off, len);
3957                         off += len;
3958                 }
3959 
3960                 /*
3961                  * zero remainder with byte stores.
3962                  */
3963                 while (rem-- > 0)
3964                         pp_addr2[off++] = 0;
3965 #else
3966                 hwblkclr(pp_addr2 + off, len);
3967 #endif
3968         } else {
3969                 bzero(pp_addr2 + off, len);
3970         }
3971 
3972         if (!kpm_enable || pfn_is_foreign(pfn)) {
3973 #ifdef __xpv
3974                 /*
3975                  * On the hypervisor this page might get used for a page
3976                  * table before any intervening change to this mapping,
3977                  * so blow it away.
3978                  */
3979                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3980                     UVMF_INVLPG) < 0)
3981                         panic("HYPERVISOR_update_va_mapping() failed");
3982 #endif
3983                 mutex_exit(ppaddr_mutex);
3984         }
3985 
3986         kpreempt_enable();
3987 }
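
     /*
      * Worked example of the intended alignment arithmetic in the __xpv
      * branch above, assuming BLOCKZEROALIGN is 64 (hypothetical values):
      * for off == 3 and len == 200, 61 leading bytes are zeroed one at a
      * time to reach offset 64, rem = P2PHASE(139, 64) == 11, so
      * block_zero_no_xmm() clears the middle 128 bytes and the final 11
      * bytes are again zeroed byte by byte.
      */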
3988 
3989 /*
3990  * Platform-dependent page scrub call.
3991  */
3992 void
3993 pagescrub(page_t *pp, uint_t off, uint_t len)
3994 {
3995         /*
3996          * For now, we rely on the fact that pagezero() will
3997          * always clear UEs.
3998          */
3999         pagezero(pp, off, len);
4000 }
4001 
4002 /*
4003  * set up two private addresses on a given CPU for use in ppcopy()
4004  */
4005 void
4006 setup_vaddr_for_ppcopy(struct cpu *cpup)
4007 {
4008         void *addr;
4009         hat_mempte_t pte_pa;
4010 
4011         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4012         pte_pa = hat_mempte_setup(addr);
4013         cpup->cpu_caddr1 = addr;
4014         cpup->cpu_caddr1pte = pte_pa;
4015 
4016         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4017         pte_pa = hat_mempte_setup(addr);
4018         cpup->cpu_caddr2 = addr;
4019         cpup->cpu_caddr2pte = pte_pa;
4020 
4021         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4022 }
4023 
4024 /*
4025  * Undo setup_vaddr_for_ppcopy
4026  */
4027 void
4028 teardown_vaddr_for_ppcopy(struct cpu *cpup)
4029 {
4030         mutex_destroy(&cpup->cpu_ppaddr_mutex);
4031 
4032         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4033         cpup->cpu_caddr2pte = 0;
4034         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4035         cpup->cpu_caddr2 = 0;
4036 
4037         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4038         cpup->cpu_caddr1pte = 0;
4039         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4040         cpup->cpu_caddr1 = 0;
4041 }
4042 
4043 /*
4044  * Function for flushing D-cache when performing module relocations
4045  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
4046  */
4047 void
4048 dcache_flushall()
4049 {}
4050 
4051 /*
4052  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4053  * number to vary where the pages come from.  This is quite a hacked up
4054  * method -- it works for now, but really needs to be fixed up a bit.
4055  *
4056  * We currently use page_create_va() on the kvp with fake offsets,
4057  * segments and virt address.  This is pretty bogus, but was copied from the
4058  * old hat_i86.c code.  A better approach would be to specify either mnode
4059  * random or mnode local and take a page from whatever color has the MOST
4060  * available; this would have a minimal impact on page coloring.
4061  */
4062 page_t *
4063 page_get_physical(uintptr_t seed)
4064 {
4065         page_t *pp;
4066         u_offset_t offset;
4067         static struct seg tmpseg;
4068         static uintptr_t ctr = 0;
4069 
4070         /*
4071          * This code is gross; we really need a simpler page allocator.
4072          *
4073          * We need to assign an offset for the page in order to call
4074          * page_create_va().  To avoid conflicts with other pages, we get
4075          * creative with the offset: > 4Gig for 32 bits, or somewhere in
4076          * the VA hole for 64 bits.
4077          */
4078         offset = seed;
4079         if (offset > kernelbase)
4080                 offset -= kernelbase;
4081         offset <<= MMU_PAGESHIFT;
4082 #if defined(__amd64)
4083         offset += mmu.hole_start;       /* something in VA hole */
4084 #else
4085         offset += 1ULL << 40;     /* something > 4 Gig */
4086 #endif
4087 
4088         if (page_resv(1, KM_NOSLEEP) == 0)
4089                 return (NULL);
4090 
4091 #ifdef  DEBUG
4092         pp = page_exists(&kvp, offset);
4093         if (pp != NULL)
4094                 panic("page already exists %p", (void *)pp);
4095 #endif
4096 
4097         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4098             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));       /* changing VA usage */
4099         if (pp != NULL) {
4100                 page_io_unlock(pp);
4101                 page_downgrade(pp);
4102         }
4103         return (pp);
4104 }
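
     /*
      * Worked example of the offset construction above (hypothetical
      * numbers, 64-bit kernel): for seed == 0x1234 the offset becomes
      * 0x1234 << MMU_PAGESHIFT == 0x1234000, plus mmu.hole_start, which
      * places the fake (vnode, offset) pair inside the VA hole so that it
      * cannot collide with offsets derived from real kernel addresses.
      */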