1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2019, Joyent, Inc.
  28  */
  29 
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32 
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37 
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>
  48 #include <sys/vmem.h>
  49 #include <sys/buf.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/lgrp.h>
  52 #include <sys/disp.h>
  53 #include <sys/vm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vnode.h>
  56 #include <sys/cred.h>
  57 #include <sys/exec.h>
  58 #include <sys/exechdr.h>
  59 #include <sys/debug.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/swap.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/random.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_kp.h>
  69 #include <vm/seg_vn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_kmem.h>
  72 #include <vm/seg_kpm.h>
  73 #include <vm/vm_dep.h>
  74 
  75 #include <sys/cpu.h>
  76 #include <sys/vm_machparam.h>
  77 #include <sys/memlist.h>
  78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  79 #include <vm/hat_i86.h>
  80 #include <sys/x86_archext.h>
  81 #include <sys/elf_386.h>
  82 #include <sys/cmn_err.h>
  83 #include <sys/archsystm.h>
  84 #include <sys/machsystm.h>
  85 #include <sys/secflags.h>
  86 
  87 #include <sys/vtrace.h>
  88 #include <sys/ddidmareq.h>
  89 #include <sys/promif.h>
  90 #include <sys/memnode.h>
  91 #include <sys/stack.h>
  92 #include <util/qsort.h>
  93 #include <sys/taskq.h>
  94 
  95 #ifdef __xpv
  96 
  97 #include <sys/hypervisor.h>
  98 #include <sys/xen_mmu.h>
  99 #include <sys/balloon_impl.h>
 100 
 101 /*
 102  * Domain 0 pages usable for DMA are pre-allocated and kept in
 103  * distinct lists, ordered by increasing mfn.
 104  */
 105 static kmutex_t io_pool_lock;
 106 static kmutex_t contig_list_lock;
 107 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
 108 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
 109 static long io_pool_cnt;
 110 static long io_pool_cnt_max = 0;
 111 #define DEFAULT_IO_POOL_MIN     128
 112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
 113 static long io_pool_cnt_lowater = 0;
 114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
 115 static long io_pool_shrinks;    /* how many times did we really shrink */
 116 static long io_pool_grows;      /* how many times did we grow */
 117 static mfn_t start_mfn = 1;
 118 static caddr_t io_pool_kva;     /* used to alloc pages when needed */
 119 
 120 static int create_contig_pfnlist(uint_t);
 121 
 122 /*
 123  * percentage of phys mem to hold in the i/o pool
 124  */
 125 #define DEFAULT_IO_POOL_PCT     2
 126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
 127 static void page_io_pool_sub(page_t **, page_t *, page_t *);
 128 int ioalloc_dbg = 0;
 129 
 130 #endif /* __xpv */
 131 
 132 uint_t vac_colors = 1;
 133 
 134 int largepagesupport = 0;
 135 extern uint_t page_create_new;
 136 extern uint_t page_create_exists;
 137 extern uint_t page_create_putbacks;
 138 /*
 139  * Allow users to disable the kernel's use of SSE.
 140  */
 141 extern int use_sse_pagecopy, use_sse_pagezero;
 142 
 143 /*
 144  * Combined memory ranges from mnode and memranges[], used to manage a
 145  * single mnode/mtype dimension in the page lists.
 146  */
 147 typedef struct {
 148         pfn_t   mnr_pfnlo;
 149         pfn_t   mnr_pfnhi;
 150         int     mnr_mnode;
 151         int     mnr_memrange;           /* index into memranges[] */
 152         int     mnr_next;               /* next lower PA mnoderange */
 153         int     mnr_exists;
 154         /* maintain page list stats */
 155         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 156         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 157         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 158 #ifdef DEBUG
 159         struct mnr_mts {                /* mnode/mtype szc stats */
 160                 pgcnt_t mnr_mts_pgcnt;
 161                 int     mnr_mts_colors;
 162                 pgcnt_t *mnr_mtsc_pgcnt;
 163         }       *mnr_mts;
 164 #endif
 165 } mnoderange_t;
 166 
 167 #define MEMRANGEHI(mtype)                                               \
 168         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169 #define MEMRANGELO(mtype)       (memranges[mtype])
 170 
 171 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172 
 173 /*
 174  * As the PC architecture evolved, memory was clumped into several
 175  * ranges to accommodate the DMA limits of various historical I/O devices:
 176  * < 16Meg - ISA bus
 177  * < 2Gig - ???
 178  * < 4Gig - PCI bus or drivers that don't understand PAE mode
 179  *
 180  * These are listed in reverse order, so that we can skip over unused
 181  * ranges on machines with small memories.
 182  *
 183  * For now under the Hypervisor, we'll only ever have one memrange.
 184  */
 185 #define PFN_4GIG        0x100000
 186 #define PFN_16MEG       0x1000
 187 /* Indices into the memory range (arch_memranges) array. */
 188 #define MRI_4G          0
 189 #define MRI_2G          1
 190 #define MRI_16M         2
 191 #define MRI_0           3
 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193     PFN_4GIG,   /* pfn range for 4G and above */
 194     0x80000,    /* pfn range for 2G-4G */
 195     PFN_16MEG,  /* pfn range for 16M-2G */
 196     0x00000,    /* pfn range for 0-16M */
 197 };
 198 pfn_t *memranges = &arch_memranges[0];
 199 int nranges = NUM_MEM_RANGES;
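     /*
      * For example, MRI_16M (memranges[2]) covers pfns [PFN_16MEG, 0x7ffff],
      * i.e. physical addresses from 16M up to (but not including) 2G, as
      * computed by the MEMRANGELO()/MEMRANGEHI() macros above.
      */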
 200 
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
 205 static mnoderange_t *mnoderanges;
 206 static int mnoderangecnt;
 207 static int mtype4g;
 208 static int mtype16m;
 209 static int mtypetop;
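     /*
      * mtypetop is the mnoderange index covering the highest physical
      * addresses.  mtype4g (set only when physmax4g) is the index of the
      * range containing the last pfn below 4g, and mtype16m is the range
      * containing the last pfn below 16m (or -1 if there is none).
      */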
 210 
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 222  * maxmem4g is the count of the maximum number of pages on the page lists
 223  * with physical addresses below 4g. It can be a lot less than 4g given that
 224  * the BIOS may reserve large chunks of space below 4g for hot plug pci
 225  * devices, the agp aperture, etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *
 230  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 231  * 6% (desfree4gshift = 4) of maxmem4g.
 232  *
 233  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 234  * and the amount of physical memory above 4g is greater than freemem4g.
 235  * In this case, page_get_* routines will restrict below 4g allocations
 236  * for requests that don't specifically require it.
 237  */
 238 
 239 #define DESFREE4G       (maxmem4g >> desfree4gshift)
 240 
 241 #define RESTRICT4G_ALLOC                                        \
 242         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
 243 
 244 static pgcnt_t  maxmem4g;
 245 static pgcnt_t  freemem4g;
 246 static int      physmax4g;
 247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
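     /*
      * Illustrative example (hypothetical sizes): with desfree4gshift at its
      * default of 4, DESFREE4G is maxmem4g / 16, so a machine with roughly 3G
      * of usable memory below 4g gets a threshold of about 192M worth of
      * pages.  RESTRICT4G_ALLOC then steers unconstrained allocations above
      * 4g once freemem4g drops under that threshold while more memory remains
      * free above 4g than below it.
      */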
 248 
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
 254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 255  * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262 
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
 266         (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
 267             ((freemem >= (FREEMEM16M)) || \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269 
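     /*
      * The default reserve is 0x380 (896) pages, i.e. 3.5M of below-16m
      * memory.
      */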
 270 static pgcnt_t  desfree16m = 0x380;
 271 
 272 /*
 273  * This can be patched via /etc/system to allow old non-PAE aware device
 274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  */
 276 int restricted_kmemalloc = 0;
 277 
 278 #ifdef VM_STATS
 279 struct {
 280         ulong_t pga_alloc;
 281         ulong_t pga_notfullrange;
 282         ulong_t pga_nulldmaattr;
 283         ulong_t pga_allocok;
 284         ulong_t pga_allocfailed;
 285         ulong_t pgma_alloc;
 286         ulong_t pgma_allocok;
 287         ulong_t pgma_allocfailed;
 288         ulong_t pgma_allocempty;
 289 } pga_vmstats;
 290 #endif
 291 
 292 uint_t mmu_page_sizes;
 293 
 294 /* How many page sizes the users can see */
 295 uint_t mmu_exported_page_sizes;
 296 
 297 /* page sizes that legacy applications can see */
 298 uint_t mmu_legacy_page_sizes;
 299 
 300 /*
 301  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 302  * fewer than this many pages.
 303  */
 304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 306 
 307 /*
 308  * Maximum and default segment size tunables for user private
 309  * and shared anon memory, and user text and initialized data.
 310  * These can be patched via /etc/system to allow large pages
 311  * to be used for mapping application private and shared anon memory.
 312  */
 313 size_t mcntl0_lpsize = MMU_PAGESIZE;
 314 size_t max_uheap_lpsize = MMU_PAGESIZE;
 315 size_t default_uheap_lpsize = MMU_PAGESIZE;
 316 size_t max_ustack_lpsize = MMU_PAGESIZE;
 317 size_t default_ustack_lpsize = MMU_PAGESIZE;
 318 size_t max_privmap_lpsize = MMU_PAGESIZE;
 319 size_t max_uidata_lpsize = MMU_PAGESIZE;
 320 size_t max_utext_lpsize = MMU_PAGESIZE;
 321 size_t max_shm_lpsize = MMU_PAGESIZE;
 322 
 323 
 324 /*
 325  * initialized by page_coloring_init().
 326  */
 327 uint_t  page_colors;
 328 uint_t  page_colors_mask;
 329 uint_t  page_coloring_shift;
 330 int     cpu_page_colors;
 331 static uint_t   l2_colors;
 332 
 333 /*
 334  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 335  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 336  * mnode range, the page freelist and cachelist are hashed into bins based on
 337  * color. This makes it easier to search for a page within a specific memory
 338  * range.
 339  */
 340 #define PAGE_COLORS_MIN 16
 341 
 342 page_t ****page_freelists;
 343 page_t ***page_cachelists;
 344 
 345 
 346 /*
 347  * Used by page layer to know about page sizes
 348  */
 349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
 350 
 351 kmutex_t        *fpc_mutex[NPC_MUTEX];
 352 kmutex_t        *cpc_mutex[NPC_MUTEX];
 353 
 354 /* Lock to protect mnoderanges array for memory DR operations. */
 355 static kmutex_t mnoderange_lock;
 356 
 357 /*
 358  * Only let one thread at a time try to coalesce large pages, to
 359  * prevent them from working against each other.
 360  */
 361 static kmutex_t contig_lock;
 362 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
 363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
 364 
 365 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 366 
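     /*
      * Map 'pgcnt' pages of physical memory, starting at pfn 'pf', into the
      * kernel's heap arena with the given protection.  Pfns that have no
      * page_t are entered with hat_devload(); pages backed by a page_t use
      * hat_memload().  Returns the starting kernel virtual address.
      */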
 367 caddr_t
 368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
 369 {
 370         caddr_t addr;
 371         caddr_t addr1;
 372         page_t *pp;
 373 
 374         addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
 375 
 376         for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
 377                 pp = page_numtopp_nolock(pf);
 378                 if (pp == NULL) {
 379                         hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
 380                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 381                 } else {
 382                         hat_memload(kas.a_hat, addr, pp,
 383                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 384                 }
 385         }
 386 
 387         return (addr1);
 388 }
 389 
 390 /*
 391  * This routine is like page_numtopp, but accepts only free pages, which
 392  * it allocates (unfrees) and returns with the exclusive lock held.
 393  * It is used by machdep.c/dma_init() to find contiguous free pages.
 394  */
 395 page_t *
 396 page_numtopp_alloc(pfn_t pfnum)
 397 {
 398         page_t *pp;
 399 
 400 retry:
 401         pp = page_numtopp_nolock(pfnum);
 402         if (pp == NULL) {
 403                 return (NULL);
 404         }
 405 
 406         if (!page_trylock(pp, SE_EXCL)) {
 407                 return (NULL);
 408         }
 409 
 410         if (page_pptonum(pp) != pfnum) {
 411                 page_unlock(pp);
 412                 goto retry;
 413         }
 414 
 415         if (!PP_ISFREE(pp)) {
 416                 page_unlock(pp);
 417                 return (NULL);
 418         }
 419         if (pp->p_szc) {
 420                 page_demote_free_pages(pp);
 421                 page_unlock(pp);
 422                 goto retry;
 423         }
 424 
 425         /* If associated with a vnode, destroy mappings */
 426 
 427         if (pp->p_vnode) {
 428 
 429                 page_destroy_free(pp);
 430 
 431                 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 432                         return (NULL);
 433                 }
 434 
 435                 if (page_pptonum(pp) != pfnum) {
 436                         page_unlock(pp);
 437                         goto retry;
 438                 }
 439         }
 440 
 441         if (!PP_ISFREE(pp)) {
 442                 page_unlock(pp);
 443                 return (NULL);
 444         }
 445 
 446         if (!page_reclaim(pp, (kmutex_t *)NULL))
 447                 return (NULL);
 448 
 449         return (pp);
 450 }
 451 
 452 /*
 453  * Return the optimum page size for a given mapping
 454  */
 455 /*ARGSUSED*/
 456 size_t
 457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 458 {
 459         level_t l = 0;
 460         size_t pgsz = MMU_PAGESIZE;
 461         size_t max_lpsize;
 462         uint_t mszc;
 463 
 464         ASSERT(maptype != MAPPGSZ_VA);
 465 
 466         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 467                 return (MMU_PAGESIZE);
 468         }
 469 
 470         switch (maptype) {
 471         case MAPPGSZ_HEAP:
 472         case MAPPGSZ_STK:
 473                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 474                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 475                 if (max_lpsize == MMU_PAGESIZE) {
 476                         return (MMU_PAGESIZE);
 477                 }
 478                 if (len == 0) {
 479                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 480                             p->p_brksize - p->p_bssbase : p->p_stksize;
 481                 }
 482                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 483                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 484 
 485                 /*
 486                  * use the page size that best fits len
 487                  */
 488                 for (l = mmu.umax_page_level; l > 0; --l) {
 489                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 490                                 continue;
 491                         } else {
 492                                 pgsz = LEVEL_SIZE(l);
 493                         }
 494                         break;
 495                 }
 496 
 497                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 498                     p->p_stkpageszc);
 499                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 500                         pgsz = hw_page_array[mszc].hp_size;
 501                 }
 502                 return (pgsz);
 503 
 504         case MAPPGSZ_ISM:
 505                 for (l = mmu.umax_page_level; l > 0; --l) {
 506                         if (len >= LEVEL_SIZE(l))
 507                                 return (LEVEL_SIZE(l));
 508                 }
 509                 return (LEVEL_SIZE(0));
 510         }
 511         return (pgsz);
 512 }
 513 
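     /*
      * Common helper for map_pgszcvec(): find the largest exported page size
      * (no bigger than max_lpsize) for which [addr, addr + size) contains at
      * least one properly aligned chunk and addr/off are compatibly aligned,
      * and return a bit vector of that size code and all smaller large page
      * size codes.  Returns 0 if physical memory is below min_physmem or
      * large pages are effectively disabled.
      */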
 514 static uint_t
 515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 516     size_t min_physmem)
 517 {
 518         caddr_t eaddr = addr + size;
 519         uint_t szcvec = 0;
 520         caddr_t raddr;
 521         caddr_t readdr;
 522         size_t  pgsz;
 523         int i;
 524 
 525         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 526                 return (0);
 527         }
 528 
 529         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 530                 pgsz = page_get_pagesize(i);
 531                 if (pgsz > max_lpsize) {
 532                         continue;
 533                 }
 534                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 535                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 536                 if (raddr < addr || raddr >= readdr) {
 537                         continue;
 538                 }
 539                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 540                         continue;
 541                 }
 542                 /*
 543                  * Set szcvec to the remaining page sizes.
 544                  */
 545                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 546                 break;
 547         }
 548         return (szcvec);
 549 }
 550 
 551 /*
 552  * Return a bit vector of large page size codes that
 553  * can be used to map the [addr, addr + len) region.
 554  */
 555 /*ARGSUSED*/
 556 uint_t
 557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 558     int memcntl)
 559 {
 560         size_t max_lpsize = mcntl0_lpsize;
 561 
 562         if (mmu.max_page_level == 0)
 563                 return (0);
 564 
 565         if (flags & MAP_TEXT) {
 566                 if (!memcntl)
 567                         max_lpsize = max_utext_lpsize;
 568                 return (map_szcvec(addr, size, off, max_lpsize,
 569                     shm_lpg_min_physmem));
 570 
 571         } else if (flags & MAP_INITDATA) {
 572                 if (!memcntl)
 573                         max_lpsize = max_uidata_lpsize;
 574                 return (map_szcvec(addr, size, off, max_lpsize,
 575                     privm_lpg_min_physmem));
 576 
 577         } else if (type == MAPPGSZC_SHM) {
 578                 if (!memcntl)
 579                         max_lpsize = max_shm_lpsize;
 580                 return (map_szcvec(addr, size, off, max_lpsize,
 581                     shm_lpg_min_physmem));
 582 
 583         } else if (type == MAPPGSZC_HEAP) {
 584                 if (!memcntl)
 585                         max_lpsize = max_uheap_lpsize;
 586                 return (map_szcvec(addr, size, off, max_lpsize,
 587                     privm_lpg_min_physmem));
 588 
 589         } else if (type == MAPPGSZC_STACK) {
 590                 if (!memcntl)
 591                         max_lpsize = max_ustack_lpsize;
 592                 return (map_szcvec(addr, size, off, max_lpsize,
 593                     privm_lpg_min_physmem));
 594 
 595         } else {
 596                 if (!memcntl)
 597                         max_lpsize = max_privmap_lpsize;
 598                 return (map_szcvec(addr, size, off, max_lpsize,
 599                     privm_lpg_min_physmem));
 600         }
 601 }
 602 
 603 /*
 604  * Handle a pagefault.
 605  */
 606 faultcode_t
 607 pagefault(
 608         caddr_t addr,
 609         enum fault_type type,
 610         enum seg_rw rw,
 611         int iskernel)
 612 {
 613         struct as *as;
 614         struct hat *hat;
 615         struct proc *p;
 616         kthread_t *t;
 617         faultcode_t res;
 618         caddr_t base;
 619         size_t len;
 620         int err;
 621         int mapped_red;
 622         uintptr_t ea;
 623 
 624         ASSERT_STACK_ALIGNED();
 625 
 626         if (INVALID_VADDR(addr))
 627                 return (FC_NOMAP);
 628 
 629         mapped_red = segkp_map_red();
 630 
 631         if (iskernel) {
 632                 as = &kas;
 633                 hat = as->a_hat;
 634         } else {
 635                 t = curthread;
 636                 p = ttoproc(t);
 637                 as = p->p_as;
 638                 hat = as->a_hat;
 639         }
 640 
 641         /*
 642          * Dispatch pagefault.
 643          */
 644         res = as_fault(hat, as, addr, 1, type, rw);
 645 
 646         /*
 647          * If this isn't a potential unmapped hole in the user's
 648          * UNIX data or stack segments, just return status info.
 649          */
 650         if (res != FC_NOMAP || iskernel)
 651                 goto out;
 652 
 653         /*
 654          * Check to see if we happened to fault on a currently unmapped
 655          * part of the UNIX data or stack segments.  If so, create a zfod
 656          * mapping there and then try calling the fault routine again.
 657          */
 658         base = p->p_brkbase;
 659         len = p->p_brksize;
 660 
 661         if (addr < base || addr >= base + len) {          /* data seg? */
 662                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 663                 len = p->p_stksize;
 664                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 665                         /* not in either UNIX data or stack segments */
 666                         res = FC_NOMAP;
 667                         goto out;
 668                 }
 669         }
 670 
 671         /*
 672          * The rest of this function implements 3.X/4.X/5.X compatibility.
 673          * This code is probably not needed anymore.
 674          */
 675         if (p->p_model == DATAMODEL_ILP32) {
 676 
 677                 /* expand the gap to the page boundaries on each side */
 678                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 679                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 680                 len = ea - (uintptr_t)base;
 681 
 682                 as_rangelock(as);
 683                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 684                     0) {
 685                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 686                         as_rangeunlock(as);
 687                         if (err) {
 688                                 res = FC_MAKE_ERR(err);
 689                                 goto out;
 690                         }
 691                 } else {
 692                         /*
 693                          * This page is already mapped by another thread after
 694                          * we returned from as_fault() above.  We just fall
 695                          * through to the as_fault() call below.
 696                          */
 697                         as_rangeunlock(as);
 698                 }
 699 
 700                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 701         }
 702 
 703 out:
 704         if (mapped_red)
 705                 segkp_unmap_red();
 706 
 707         return (res);
 708 }
 709 
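     /*
      * Choose a mapping address on behalf of the current process: pick the
      * appropriate user limit (the 32-bit limit if _MAP_LOW32 is set) and
      * defer to map_addr_proc().
      */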
 710 void
 711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 712 {
 713         struct proc *p = curproc;
 714         caddr_t userlimit = (flags & _MAP_LOW32) ?
 715             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 716 
 717         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 718 }
 719 
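     /*
      * x86 has no virtually addressed caches, so there is never a VAC
      * alignment conflict to report.
      */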
 720 /*ARGSUSED*/
 721 int
 722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 723 {
 724         return (0);
 725 }
 726 
 727 /*
 728  * The maximum amount a randomized mapping will be slewed.  We should perhaps
 729  * arrange things so these tunables can be separate for mmap, mmapobj, and
 730  * ld.so
 731  */
 732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 733 
 734 /*
 735  * map_addr_proc() is the routine called when the system is to
 736  * choose an address for the user.  We will pick an address
 737  * range which is the highest available below userlimit.
 738  *
 739  * Every mapping will have a redzone of a single page on either side of
 740  * the request. This is done to leave one page unmapped between segments.
 741  * This is not required, but it's useful for the user because if their
 742  * program strays across a segment boundary, it will catch a fault
 743  * immediately, making debugging a little easier.  Currently the redzone
 744  * is mandatory.
 745  *
 746  * addrp is a value/result parameter.
 747  *      On input it is a hint from the user to be used in a completely
 748  *      machine dependent fashion.  We decide to completely ignore this hint.
 749  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 750  *      must be some "power of two" multiple of pagesize.
 751  *
 752  *      On output it is NULL if no address can be found in the current
 753  *      process's address space or else an address that is currently
 754  *      not mapped for len bytes with a page of red zone on either side.
 755  *
 756  *      vacalign is not needed on x86 (it's for virtually addressed caches)
 757  */
 758 /*ARGSUSED*/
 759 void
 760 map_addr_proc(
 761         caddr_t *addrp,
 762         size_t len,
 763         offset_t off,
 764         int vacalign,
 765         caddr_t userlimit,
 766         struct proc *p,
 767         uint_t flags)
 768 {
 769         struct as *as = p->p_as;
 770         caddr_t addr;
 771         caddr_t base;
 772         size_t slen;
 773         size_t align_amount;
 774 
 775         ASSERT32(userlimit == as->a_userlimit);
 776 
 777         base = p->p_brkbase;
 778 #if defined(__amd64)
 779         if (p->p_model == DATAMODEL_NATIVE) {
 780                 if (userlimit < as->a_userlimit) {
 781                         /*
 782                          * This happens when a program wants to map
 783                          * something in a range that's accessible to a
 784                          * program in a smaller address space.  For example,
 785                          * a 64-bit program calling mmap32(2) to guarantee
 786                          * that the returned address is below 4Gbytes.
 787                          */
 788                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 789 
 790                         if (userlimit > base)
 791                                 slen = userlimit - base;
 792                         else {
 793                                 *addrp = NULL;
 794                                 return;
 795                         }
 796                 } else {
 797                         /*
 798                          * With the stack positioned at a higher address than
 799                          * the heap for 64-bit processes, it is necessary to be
 800                          * mindful of its location and potential size.
 801                          *
 802                          * Unallocated space above the top of the stack (that
 803                          * is, at a lower address) but still within the bounds
 804                          * of the stack limit should be considered unavailable.
 805                          *
 806                          * As the 64-bit stack guard is mapped in immediately
 807                          * adjacent to the stack limit boundary, this prevents
 808                          * new mappings from having accidentally dangerous
 809                          * proximity to the stack.
 810                          */
 811                         slen = p->p_usrstack - base -
 812                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 813                 }
 814         } else
 815 #endif /* defined(__amd64) */
 816                 slen = userlimit - base;
 817 
 818         /* Make len be a multiple of PAGESIZE */
 819         len = (len + PAGEOFFSET) & PAGEMASK;
 820 
 821         /*
 822          * figure out what the alignment should be
 823          *
 824          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 825          */
 826         if (len <= ELF_386_MAXPGSZ) {
 827                 /*
 828                  * Align virtual addresses to ensure that ELF shared libraries
 829                  * are mapped with the appropriate alignment constraints by
 830                  * the run-time linker.
 831                  */
 832                 align_amount = ELF_386_MAXPGSZ;
 833         } else {
 834                 /*
 835                  * For 32-bit processes, only those which have specified
 836                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 837                  * Not doing so can potentially waste up to 1G of process
 838                  * address space.
 839                  */
 840                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 841                     mmu.umax_page_level;
 842 
 843                 while (lvl && len < LEVEL_SIZE(lvl))
 844                         --lvl;
 845 
 846                 align_amount = LEVEL_SIZE(lvl);
 847         }
 848         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 849                 align_amount = (uintptr_t)*addrp;
 850 
 851         ASSERT(ISP2(align_amount));
 852         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 853 
 854         off = off & (align_amount - 1);
 855 
 856         /*
 857          * Look for a large enough hole starting below userlimit.
 858          * After finding it, use the upper part.
 859          */
 860         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 861             PAGESIZE, off) == 0) {
 862                 caddr_t as_addr;
 863 
 864                 /*
 865                  * addr is the highest possible address to use since we have
 866                  * a PAGESIZE redzone at the beginning and end.
 867                  */
 868                 addr = base + slen - (PAGESIZE + len);
 869                 as_addr = addr;
 870                 /*
 871                  * Round address DOWN to the alignment amount and
 872                  * add the offset in.
 873                  * If addr is greater than as_addr, len would not be large
 874                  * enough to include the redzone, so we must adjust down
 875                  * by the alignment amount.
 876                  */
 877                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 878                 addr += (uintptr_t)off;
 879                 if (addr > as_addr) {
 880                         addr -= align_amount;
 881                 }
 882 
 883                 /*
 884                  * If randomization is requested, slew the allocation
 885                  * backwards, within the same gap, by a random amount.
 886                  */
 887                 if (flags & _MAP_RANDOMIZE) {
 888                         uint32_t slew;
 889 
 890                         (void) random_get_pseudo_bytes((uint8_t *)&slew,
 891                             sizeof (slew));
 892 
 893                         slew = slew % MIN(aslr_max_map_skew, (addr - base));
 894                         addr -= P2ALIGN(slew, align_amount);
 895                 }
 896 
 897                 ASSERT(addr > base);
 898                 ASSERT(addr + len < base + slen);
 899                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 900                     ((uintptr_t)(off)));
 901                 *addrp = addr;
 902         } else {
 903                 *addrp = NULL;  /* no more virtual space */
 904         }
 905 }
 906 
 907 int valid_va_range_aligned_wraparound;
 908 
 909 /*
 910  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 911  * addresses at least "minlen" long, where the base of the range is at "off"
 912  * phase from an "align" boundary and there is space for a "redzone"-sized
 913  * redzone on either side of the range.  On success, 1 is returned and *basep
 914  * and *lenp are adjusted to describe the acceptable range (including
 915  * the redzone).  On failure, 0 is returned.
 916  */
 917 /*ARGSUSED3*/
 918 int
 919 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 920     size_t align, size_t redzone, size_t off)
 921 {
 922         uintptr_t hi, lo;
 923         size_t tot_len;
 924 
 925         ASSERT(align == 0 ? off == 0 : off < align);
 926         ASSERT(ISP2(align));
 927         ASSERT(align == 0 || align >= PAGESIZE);
 928 
 929         lo = (uintptr_t)*basep;
 930         hi = lo + *lenp;
 931         tot_len = minlen + 2 * redzone; /* need at least this much space */
 932 
 933         /*
 934          * If hi rolled over the top, try cutting back.
 935          */
 936         if (hi < lo) {
 937                 *lenp = 0UL - lo - 1UL;
 938                 /* See if this really happens. If so, then we figure out why */
 939                 valid_va_range_aligned_wraparound++;
 940                 hi = lo + *lenp;
 941         }
 942         if (*lenp < tot_len) {
 943                 return (0);
 944         }
 945 
 946 #if defined(__amd64)
 947         /*
 948          * Deal with a possible hole in the address range between
 949          * hole_start and hole_end that should never be mapped.
 950          */
 951         if (lo < hole_start) {
 952                 if (hi > hole_start) {
 953                         if (hi < hole_end) {
 954                                 hi = hole_start;
 955                         } else {
 956                                 /* lo < hole_start && hi >= hole_end */
 957                                 if (dir == AH_LO) {
 958                                         /*
 959                                          * prefer lowest range
 960                                          */
 961                                         if (hole_start - lo >= tot_len)
 962                                                 hi = hole_start;
 963                                         else if (hi - hole_end >= tot_len)
 964                                                 lo = hole_end;
 965                                         else
 966                                                 return (0);
 967                                 } else {
 968                                         /*
 969                                          * prefer highest range
 970                                          */
 971                                         if (hi - hole_end >= tot_len)
 972                                                 lo = hole_end;
 973                                         else if (hole_start - lo >= tot_len)
 974                                                 hi = hole_start;
 975                                         else
 976                                                 return (0);
 977                                 }
 978                         }
 979                 }
 980         } else {
 981                 /* lo >= hole_start */
 982                 if (hi < hole_end)
 983                         return (0);
 984                 if (lo < hole_end)
 985                         lo = hole_end;
 986         }
 987 #endif
 988 
 989         if (hi - lo < tot_len)
 990                 return (0);
 991 
 992         if (align > 1) {
 993                 uintptr_t tlo = lo + redzone;
 994                 uintptr_t thi = hi - redzone;
 995                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 996                 if (tlo < lo + redzone) {
 997                         return (0);
 998                 }
 999                 if (thi < tlo || thi - tlo < minlen) {
1000                         return (0);
1001                 }
1002         }
1003 
1004         *basep = (caddr_t)lo;
1005         *lenp = hi - lo;
1006         return (1);
1007 }
1008 
1009 /*
1010  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1011  * addresses at least "minlen" long.  On success, 1 is returned and *basep
1012  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1013  * is returned.
1014  */
1015 int
1016 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1017 {
1018         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1019 }
1020 
1021 /*
1022  * Default to forbidding the first 64k of address space.  This protects most
1023  * reasonably sized structures from dereferences through NULL:
1024  *     ((foo_t *)0)->bar
1025  */
1026 uintptr_t forbidden_null_mapping_sz = 0x10000;
1027 
1028 /*
1029  * Determine whether [addr, addr+len] are valid user addresses.
1030  */
1031 /*ARGSUSED*/
1032 int
1033 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1034     caddr_t userlimit)
1035 {
1036         caddr_t eaddr = addr + len;
1037 
1038         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1039                 return (RANGE_BADADDR);
1040 
1041         if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1042             as->a_proc != NULL &&
1043             secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1044                 return (RANGE_BADADDR);
1045 
1046 #if defined(__amd64)
1047         /*
1048          * Check for the VA hole
1049          */
1050         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1051                 return (RANGE_BADADDR);
1052 #endif
1053 
1054         return (RANGE_OKAY);
1055 }
1056 
1057 /*
1058  * Return 1 if the page frame is onboard memory, else 0.
1059  */
1060 int
1061 pf_is_memory(pfn_t pf)
1062 {
1063         if (pfn_is_foreign(pf))
1064                 return (0);
1065         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1066 }
1067 
1068 /*
1069  * return the memrange containing pfn
1070  */
1071 int
1072 memrange_num(pfn_t pfn)
1073 {
1074         int n;
1075 
1076         for (n = 0; n < nranges - 1; ++n) {
1077                 if (pfn >= memranges[n])
1078                         break;
1079         }
1080         return (n);
1081 }
1082 
1083 /*
1084  * return the mnoderange containing pfn
1085  */
1086 /*ARGSUSED*/
1087 int
1088 pfn_2_mtype(pfn_t pfn)
1089 {
1090 #if defined(__xpv)
1091         return (0);
1092 #else
1093         int     n;
1094 
1095         /* Always start from highest pfn and work our way down */
1096         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1097                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
1098                         break;
1099                 }
1100         }
1101         return (n);
1102 #endif
1103 }
1104 
1105 #if !defined(__xpv)
1106 /*
1107  * is_contigpage_free:
1108  *      returns a page list of contiguous pages. It minimally has to return
1109  *      minctg pages. Caller determines minctg based on the scatter-gather
1110  *      list length.
1111  *
1112  *      pfnp is set to the next page frame to search on return.
1113  */
1114 static page_t *
1115 is_contigpage_free(
1116         pfn_t *pfnp,
1117         pgcnt_t *pgcnt,
1118         pgcnt_t minctg,
1119         uint64_t pfnseg,
1120         int iolock)
1121 {
1122         int     i = 0;
1123         pfn_t   pfn = *pfnp;
1124         page_t  *pp;
1125         page_t  *plist = NULL;
1126 
1127         /*
1128          * fail if pfn + minctg crosses a segment boundary.
1129          * Adjust for next starting pfn to begin at segment boundary.
1130          */
1131 
1132         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1133                 *pfnp = roundup(*pfnp, pfnseg + 1);
1134                 return (NULL);
1135         }
1136 
1137         do {
1138 retry:
1139                 pp = page_numtopp_nolock(pfn + i);
1140                 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1141                     (page_trylock(pp, SE_EXCL) == 0)) {
1142                         (*pfnp)++;
1143                         break;
1144                 }
1145                 if (page_pptonum(pp) != pfn + i) {
1146                         page_unlock(pp);
1147                         goto retry;
1148                 }
1149 
1150                 if (!(PP_ISFREE(pp))) {
1151                         page_unlock(pp);
1152                         (*pfnp)++;
1153                         break;
1154                 }
1155 
1156                 if (!PP_ISAGED(pp)) {
1157                         page_list_sub(pp, PG_CACHE_LIST);
1158                         page_hashout(pp, (kmutex_t *)NULL);
1159                 } else {
1160                         page_list_sub(pp, PG_FREE_LIST);
1161                 }
1162 
1163                 if (iolock)
1164                         page_io_lock(pp);
1165                 page_list_concat(&plist, &pp);
1166 
1167                 /*
1168                  * exit loop when pgcnt satisfied or segment boundary reached.
1169                  */
1170 
1171         } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1172 
1173         *pfnp += i;             /* set to next pfn to search */
1174 
1175         if (i >= minctg) {
1176                 *pgcnt -= i;
1177                 return (plist);
1178         }
1179 
1180         /*
1181          * failure: minctg not satisfied.
1182          *
1183          * if next request crosses segment boundary, set next pfn
1184          * to search from the segment boundary.
1185          */
1186         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1187                 *pfnp = roundup(*pfnp, pfnseg + 1);
1188 
1189         /* clean up any pages already allocated */
1190 
1191         while (plist) {
1192                 pp = plist;
1193                 page_sub(&plist, pp);
1194                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1195                 if (iolock)
1196                         page_io_unlock(pp);
1197                 page_unlock(pp);
1198         }
1199 
1200         return (NULL);
1201 }
1202 #endif  /* !__xpv */
1203 
1204 /*
1205  * verify that pages returned by the allocator have correct DMA attributes
1206  */
1207 #ifndef DEBUG
1208 #define check_dma(a, b, c) (void)(0)
1209 #else
1210 static void
1211 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1212 {
1213         if (dma_attr == NULL)
1214                 return;
1215 
1216         while (cnt-- > 0) {
1217                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1218                     dma_attr->dma_attr_addr_lo)
1219                         panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1220                 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1221                     dma_attr->dma_attr_addr_hi)
1222                         panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1223                 pp = pp->p_next;
1224         }
1225 }
1226 #endif
1227 
1228 #if !defined(__xpv)
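     /*
      * Allocate a list of physically contiguous page runs satisfying the DMA
      * attributes in 'mattr', if supplied (address range, segment boundary,
      * alignment and scatter/gather length).  The search resumes from the
      * last successful starting pfn and prefers memory above 16M so the low
      * pool is conserved.  Returns NULL if the request cannot be satisfied.
      */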
1229 static page_t *
1230 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1231 {
1232         pfn_t           pfn;
1233         int             sgllen;
1234         uint64_t        pfnseg;
1235         pgcnt_t         minctg;
1236         page_t          *pplist = NULL, *plist;
1237         uint64_t        lo, hi;
1238         pgcnt_t         pfnalign = 0;
1239         static pfn_t    startpfn;
1240         static pgcnt_t  lastctgcnt;
1241         uintptr_t       align;
1242 
1243         CONTIG_LOCK();
1244 
1245         if (mattr) {
1246                 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1247                 hi = mmu_btop(mattr->dma_attr_addr_hi);
1248                 if (hi >= physmax)
1249                         hi = physmax - 1;
1250                 sgllen = mattr->dma_attr_sgllen;
1251                 pfnseg = mmu_btop(mattr->dma_attr_seg);
1252 
1253                 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1254                 if (align > MMU_PAGESIZE)
1255                         pfnalign = mmu_btop(align);
1256 
1257                 /*
1258                  * in order to satisfy the request, we must minimally
1259                  * acquire minctg contiguous pages
1260                  */
1261                 minctg = howmany(*pgcnt, sgllen);
1262 
1263                 ASSERT(hi >= lo);
1264 
1265                 /*
1266                  * start from where we last searched if minctg >= lastctgcnt
1267                  */
1268                 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1269                         startpfn = lo;
1270         } else {
1271                 hi = physmax - 1;
1272                 lo = 0;
1273                 sgllen = 1;
1274                 pfnseg = mmu.highest_pfn;
1275                 minctg = *pgcnt;
1276 
1277                 if (minctg < lastctgcnt)
1278                         startpfn = lo;
1279         }
1280         lastctgcnt = minctg;
1281 
1282         ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1283 
1284         /* conserve 16m memory - start search above 16m when possible */
1285         if (hi > PFN_16M && startpfn < PFN_16M)
1286                 startpfn = PFN_16M;
1287 
1288         pfn = startpfn;
1289         if (pfnalign)
1290                 pfn = P2ROUNDUP(pfn, pfnalign);
1291 
1292         while (pfn + minctg - 1 <= hi) {
1293 
1294                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1295                 if (plist) {
1296                         page_list_concat(&pplist, &plist);
1297                         sgllen--;
1298                         /*
1299                          * return when contig pages no longer needed
1300                          */
1301                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1302                                 startpfn = pfn;
1303                                 CONTIG_UNLOCK();
1304                                 check_dma(mattr, pplist, *pgcnt);
1305                                 return (pplist);
1306                         }
1307                         minctg = howmany(*pgcnt, sgllen);
1308                 }
1309                 if (pfnalign)
1310                         pfn = P2ROUNDUP(pfn, pfnalign);
1311         }
1312 
1313         /* cannot find contig pages in specified range */
1314         if (startpfn == lo) {
1315                 CONTIG_UNLOCK();
1316                 return (NULL);
1317         }
1318 
1319         /* did not start with lo previously */
1320         pfn = lo;
1321         if (pfnalign)
1322                 pfn = P2ROUNDUP(pfn, pfnalign);
1323 
1324         /* allow search to go above startpfn */
1325         while (pfn < startpfn) {
1326 
1327                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1328                 if (plist != NULL) {
1329 
1330                         page_list_concat(&pplist, &plist);
1331                         sgllen--;
1332 
1333                         /*
1334                          * return when contig pages no longer needed
1335                          */
1336                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1337                                 startpfn = pfn;
1338                                 CONTIG_UNLOCK();
1339                                 check_dma(mattr, pplist, *pgcnt);
1340                                 return (pplist);
1341                         }
1342                         minctg = howmany(*pgcnt, sgllen);
1343                 }
1344                 if (pfnalign)
1345                         pfn = P2ROUNDUP(pfn, pfnalign);
1346         }
1347         CONTIG_UNLOCK();
1348         return (NULL);
1349 }
1350 #endif  /* !__xpv */
1351 
1352 /*
1353  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1354  * memranges[]. Used to determine the size of page lists and mnoderanges.
1355  */
1356 int
1357 mnode_range_cnt(int mnode)
1358 {
1359 #if defined(__xpv)
1360         ASSERT(mnode == 0);
1361         return (1);
1362 #else   /* __xpv */
1363         int     mri;
1364         int     mnrcnt = 0;
1365 
1366         if (mem_node_config[mnode].exists != 0) {
1367                 mri = nranges - 1;
1368 
1369                 /* find the memranges index containing the mnode's base pfn */
1370 
1371                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1372                         mri--;
1373 
1374                 /*
1375                  * increment mnode range counter when memranges or mnode
1376                  * boundary is reached.
1377                  */
1378                 while (mri >= 0 &&
1379                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380                         mnrcnt++;
1381                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382                                 mri--;
1383                         else
1384                                 break;
1385                 }
1386         }
1387         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388         return (mnrcnt);
1389 #endif  /* __xpv */
1390 }
1391 
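     /*
      * Comparator for qsort(): order mnoderange_t entries by increasing
      * mnr_pfnlo.
      */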
1392 static int
1393 mnoderange_cmp(const void *v1, const void *v2)
1394 {
1395         const mnoderange_t *m1 = v1;
1396         const mnoderange_t *m2 = v2;
1397 
1398         if (m1->mnr_pfnlo < m2->mnr_pfnlo)
1399                 return (-1);
1400         return (m1->mnr_pfnlo > m2->mnr_pfnlo);
1401 }
1402 
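     /*
      * Populate mnoderanges[] by intersecting each existing memory node with
      * the memranges[] boundaries, then sort the entries by physical address
      * and chain them through mnr_next so that mtypetop names the range with
      * the highest physical addresses.
      */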
1403 void
1404 mnode_range_setup(mnoderange_t *mnoderanges)
1405 {
1406         mnoderange_t *mp;
1407         size_t nr_ranges;
1408         size_t mnode;
1409 
1410         for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
1411             mnode < max_mem_nodes; mnode++) {
1412                 size_t mri = nranges - 1;
1413 
1414                 if (mem_node_config[mnode].exists == 0)
1415                         continue;
1416 
1417                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1418                         mri--;
1419 
1420                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1421                     MEMRANGELO(mri)) {
1422                         mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1423                             mem_node_config[mnode].physbase);
1424                         mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1425                             mem_node_config[mnode].physmax);
1426                         mp->mnr_mnode = mnode;
1427                         mp->mnr_memrange = mri;
1428                         mp->mnr_next = -1;
1429                         mp->mnr_exists = 1;
1430                         mp++;
1431                         nr_ranges++;
1432                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1433                                 mri--;
1434                         else
1435                                 break;
1436                 }
1437         }
1438 
1439         /*
1440          * mnoderangecnt can be larger than nr_ranges when memory DR is
1441          * supposedly supported.
1442          */
1443         VERIFY3U(nr_ranges, <=, mnoderangecnt);
1444 
1445         qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
1446 
1447         /*
1448          * If some intrepid soul takes the axe to the memory DR code, we can
1449          * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
1450          *
1451          * The VERIFY3U() above can be "==" then too.
1452          */
1453         for (size_t i = 1; i < nr_ranges; i++)
1454                 mnoderanges[i].mnr_next = i - 1;
1455 
1456         mtypetop = nr_ranges - 1;
1457         mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
1458         if (physmax4g)
1459                 mtype4g = pfn_2_mtype(0xfffff);
1460 }
1461 
1462 #ifndef __xpv
1463 /*
1464  * Update mnoderanges for memory hot-add DR operations.
1465  */
1466 static void
1467 mnode_range_add(int mnode)
1468 {
1469         int     *prev;
1470         int     n, mri;
1471         pfn_t   start, end;
1472         extern  void membar_sync(void);
1473 
1474         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475         ASSERT(mem_node_config[mnode].exists);
1476         start = mem_node_config[mnode].physbase;
1477         end = mem_node_config[mnode].physmax;
1478         ASSERT(start <= end);
1479         mutex_enter(&mnoderange_lock);
1480 
1481 #ifdef  DEBUG
1482         /* Check whether it interleaves with other memory nodes. */
1483         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1484                 ASSERT(mnoderanges[n].mnr_exists);
1485                 if (mnoderanges[n].mnr_mnode == mnode)
1486                         continue;
1487                 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1488                     end < mnoderanges[n].mnr_pfnlo);
1489         }
1490 #endif  /* DEBUG */
1491 
1492         mri = nranges - 1;
1493         while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1494                 mri--;
1495         while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1496                 /* Check whether mtype already exists. */
1497                 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1498                         if (mnoderanges[n].mnr_mnode == mnode &&
1499                             mnoderanges[n].mnr_memrange == mri) {
1500                                 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1501                                     start);
1502                                 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1503                                     end);
1504                                 break;
1505                         }
1506                 }
1507 
1508                 /* Add a new entry if it doesn't exist yet. */
1509                 if (n == -1) {
1510                         /* Try to find an unused entry in mnoderanges array. */
1511                         for (n = 0; n < mnoderangecnt; n++) {
1512                                 if (mnoderanges[n].mnr_exists == 0)
1513                                         break;
1514                         }
1515                         ASSERT(n < mnoderangecnt);
1516                         mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1517                         mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1518                         mnoderanges[n].mnr_mnode = mnode;
1519                         mnoderanges[n].mnr_memrange = mri;
1520                         mnoderanges[n].mnr_exists = 1;
1521                         /* Page 0 should always be present. */
1522                         for (prev = &mtypetop;
1523                             mnoderanges[*prev].mnr_pfnlo > start;
1524                             prev = &mnoderanges[*prev].mnr_next) {
1525                                 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1526                                 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1527                         }
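                             /*
                              * Link the new entry in only after it is fully
                              * initialized; the membar_sync() orders the
                              * stores, since readers such as mtype_func()
                              * follow mnr_next without taking
                              * mnoderange_lock.
                              */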
1528                         mnoderanges[n].mnr_next = *prev;
1529                         membar_sync();
1530                         *prev = n;
1531                 }
1532 
1533                 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1534                         mri--;
1535                 else
1536                         break;
1537         }
1538 
1539         mutex_exit(&mnoderange_lock);
1540 }
1541 
1542 /*
1543  * Update mnoderanges for memory hot-removal DR operations.
1544  */
1545 static void
1546 mnode_range_del(int mnode)
1547 {
1548         _NOTE(ARGUNUSED(mnode));
1549         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1550         /* TODO: support deletion operation. */
1551         ASSERT(0);
1552 }
1553 
1554 void
1555 plat_slice_add(pfn_t start, pfn_t end)
1556 {
1557         mem_node_add_slice(start, end);
1558         if (plat_dr_enabled()) {
1559                 mnode_range_add(PFN_2_MEM_NODE(start));
1560         }
1561 }
1562 
1563 void
1564 plat_slice_del(pfn_t start, pfn_t end)
1565 {
1566         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1567         ASSERT(plat_dr_enabled());
1568         mnode_range_del(PFN_2_MEM_NODE(start));
1569         mem_node_del_slice(start, end);
1570 }
1571 #endif  /* __xpv */
1572 
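     /*
      * Pick the starting mtype for a page allocation and set the
      * PGI_MT_RANGE* flags that tell the page_get routines which physical
      * ranges they may search.
      */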
1573 /*ARGSUSED*/
1574 int
1575 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1576 {
1577         int mtype = mtypetop;
1578 
1579 #if !defined(__xpv)
1580 #if defined(__i386)
1581         /*
1582          * Set the mtype range:
1583          * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1584          * - for non-kmem requests, set the range to above 4g if memory
1585          *   below 4g runs low.
1586          */
1587         if (restricted_kmemalloc && VN_ISKAS(vp) &&
1588             (caddr_t)(vaddr) >= kernelheap &&
1589             (caddr_t)(vaddr) < ekernelheap) {
1590                 ASSERT(physmax4g);
1591                 mtype = mtype4g;
1592                 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1593                     btop(pgsz), *flags)) {
1594                         *flags |= PGI_MT_RANGE16M;
1595                 } else {
1596                         VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1597                         VM_STAT_COND_ADD((*flags & PG_PANIC),
1598                             vmm_vmstats.pgpanicalloc);
1599                         *flags |= PGI_MT_RANGE0;
1600                 }
1601                 return (mtype);
1602         }
1603 #endif  /* __i386 */
1604 
1605         if (RESTRICT4G_ALLOC) {
1606                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1607                 /* here only for > 4g systems */
1608                 *flags |= PGI_MT_RANGE4G;
1609         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1610                 *flags |= PGI_MT_RANGE16M;
1611         } else {
1612                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1613                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1614                 *flags |= PGI_MT_RANGE0;
1615         }
1616 #endif /* !__xpv */
1617         return (mtype);
1618 }
1619 
1620 
1621 /* mtype init for page_get_replacement_page */
1622 /*ARGSUSED*/
1623 int
1624 mtype_pgr_init(int *flags, page_t *pp, pgcnt_t pgcnt)
1625 {
1626         int mtype = mtypetop;
1627 #if !defined(__xpv)
1628         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1629                 *flags |= PGI_MT_RANGE16M;
1630         } else {
1631                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1632                 *flags |= PGI_MT_RANGE0;
1633         }
1634 #endif
1635         return (mtype);
1636 }
1637 
1638 /*
1639  * Determine if the mnode range specified in mtype contains memory belonging
1640  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1641  * the range from high pfn to 0, 16m or 4g.
1642  *
1643  * Return first mnode range type index found otherwise return -1 if none found.
1644  * Return the first mnode range type index found; otherwise return -1.
1645 int
1646 mtype_func(int mnode, int mtype, uint_t flags)
1647 {
1648         if (flags & PGI_MT_RANGE) {
1649                 int     mnr_lim = MRI_0;
1650 
1651                 if (flags & PGI_MT_NEXT) {
1652                         mtype = mnoderanges[mtype].mnr_next;
1653                 }
1654                 if (flags & PGI_MT_RANGE4G)
1655                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1656                 else if (flags & PGI_MT_RANGE16M)
1657                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1658                 while (mtype != -1 &&
1659                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1660                         if (mnoderanges[mtype].mnr_mnode == mnode)
1661                                 return (mtype);
1662                         mtype = mnoderanges[mtype].mnr_next;
1663                 }
1664         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1665                 return (mtype);
1666         }
1667         return (-1);
1668 }
1669 
1670 /*
1671  * Update the page list max counts with the pfn range specified by the
1672  * input parameters.
1673  */
1674 void
1675 mtype_modify_max(pfn_t startpfn, long cnt)
1676 {
1677         int             mtype;
1678         pgcnt_t         inc;
1679         spgcnt_t        scnt = (spgcnt_t)(cnt);
1680         pgcnt_t         acnt = ABS(scnt);
1681         pfn_t           endpfn = startpfn + acnt;
1682         pfn_t           pfn, lo;
1683 
1684         if (!physmax4g)
1685                 return;
1686 
1687         mtype = mtypetop;
1688         for (pfn = endpfn; pfn > startpfn; ) {
1689                 ASSERT(mtype != -1);
1690                 lo = mnoderanges[mtype].mnr_pfnlo;
1691                 if (pfn > lo) {
1692                         if (startpfn >= lo) {
1693                                 inc = pfn - startpfn;
1694                         } else {
1695                                 inc = pfn - lo;
1696                         }
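                             /*
                              * maxmem4g tracks pages below 4G, so ranges
                              * in the MRI_4G (4G and above) memrange do
                              * not contribute to it.
                              */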
1697                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1698                                 if (scnt > 0)
1699                                         maxmem4g += inc;
1700                                 else
1701                                         maxmem4g -= inc;
1702                         }
1703                         pfn -= inc;
1704                 }
1705                 mtype = mnoderanges[mtype].mnr_next;
1706         }
1707 }
1708 
1709 int
1710 mtype_2_mrange(int mtype)
1711 {
1712         return (mnoderanges[mtype].mnr_memrange);
1713 }
1714 
1715 void
1716 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1717 {
1718         _NOTE(ARGUNUSED(mnode));
1719         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1720         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1721         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1722 }
1723 
1724 size_t
1725 plcnt_sz(size_t ctrs_sz)
1726 {
1727 #ifdef DEBUG
1728         int     szc, colors;
1729 
1730         ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1731         for (szc = 0; szc < mmu_page_sizes; szc++) {
1732                 colors = page_get_pagecolors(szc);
1733                 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1734         }
1735 #endif
1736         return (ctrs_sz);
1737 }
1738 
1739 caddr_t
1740 plcnt_init(caddr_t addr)
1741 {
1742 #ifdef DEBUG
1743         int     mt, szc, colors;
1744 
1745         for (mt = 0; mt < mnoderangecnt; mt++) {
1746                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1747                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1748                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1749                         colors = page_get_pagecolors(szc);
1750                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1751                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1752                             (pgcnt_t *)addr;
1753                         addr += (sizeof (pgcnt_t) * colors);
1754                 }
1755         }
1756 #endif
1757         return (addr);
1758 }
1759 
1760 void
1761 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1762 {
1763         _NOTE(ARGUNUSED(pp));
1764 #ifdef DEBUG
1765         int     bin = PP_2_BIN(pp);
1766 
1767         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1768         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1769             cnt);
1770 #endif
1771         ASSERT(mtype == PP_2_MTYPE(pp));
1772         if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1773                 atomic_add_long(&freemem4g, cnt);
1774         if (flags & PG_CACHE_LIST)
1775                 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1776         else
1777                 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1778         atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1779 }
1780 
1781 /*
1782  * Returns the free page count for mnode
1783  */
1784 int
1785 mnode_pgcnt(int mnode)
1786 {
1787         int     mtype = mtypetop;
1788         int     flags = PGI_MT_RANGE0;
1789         pgcnt_t pgcnt = 0;
1790 
1791         mtype = mtype_func(mnode, mtype, flags);
1792 
1793         while (mtype != -1) {
1794                 pgcnt += MTYPE_FREEMEM(mtype);
1795                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1796         }
1797         return (pgcnt);
1798 }
1799 
1800 /*
1801  * Initialize page coloring variables based on the l2 cache parameters.
1802  * Calculate and return memory needed for page coloring data structures.
1803  */
1804 size_t
1805 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1806 {
1807         _NOTE(ARGUNUSED(l2_linesz));
1808         size_t  colorsz = 0;
1809         int     i;
1810         int     colors;
1811 
1812 #if defined(__xpv)
1813         /*
1814          * Hypervisor domains currently don't have any concept of NUMA.
1815          * Hence we'll act like there is only 1 memrange.
1816          */
1817         i = memrange_num(1);
1818 #else /* !__xpv */
1819         /*
1820          * Reduce the memory ranges list if we don't have large amounts
1821          * of memory. This avoids searching known empty free lists.
1822          * To support memory DR operations, we need to keep memory ranges
1823          * for possible memory hot-add operations.
1824          */
1825         if (plat_dr_physmax > physmax)
1826                 i = memrange_num(plat_dr_physmax);
1827         else
1828                 i = memrange_num(physmax);
1829 #if defined(__i386)
1830         if (i > MRI_4G)
1831                 restricted_kmemalloc = 0;
1832 #endif
1833         /* physmax greater than 4g */
1834         if (i == MRI_4G)
1835                 physmax4g = 1;
1836 #endif /* !__xpv */
1837         memranges += i;
1838         nranges -= i;
1839 
1840         ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1841 
1842         ASSERT(ISP2(l2_linesz));
1843         ASSERT(l2_sz > MMU_PAGESIZE);
1844 
1845         /* l2_assoc is 0 for fully associative l2 cache */
1846         if (l2_assoc)
1847                 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1848         else
1849                 l2_colors = 1;
1850 
1851         ASSERT(ISP2(l2_colors));
1852 
1853         /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1854         page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1855 
1856         /*
1857          * cpu_page_colors is non-zero when a page color may be spread across
1858          * multiple bins.
1859          */
1860         if (l2_colors < page_colors)
1861                 cpu_page_colors = l2_colors;
1862 
1863         ASSERT(ISP2(page_colors));
1864 
1865         page_colors_mask = page_colors - 1;
1866 
1867         ASSERT(ISP2(CPUSETSIZE()));
1868         page_coloring_shift = lowbit(CPUSETSIZE());
1869 
1870         /* initialize number of colors per page size */
1871         for (i = 0; i <= mmu.max_page_level; i++) {
1872                 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1873                 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1874                 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1875                 hw_page_array[i].hp_colors = (page_colors_mask >>
1876                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1877                     + 1;
1878                 colorequivszc[i] = 0;
1879         }
1880 
1881         /*
1882          * The value of cpu_page_colors determines if additional color bins
1883          * need to be checked for a particular color in the page_get routines.
1884          */
1885         if (cpu_page_colors != 0) {
1886 
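                     /*
                      * Both counts are powers of two, so a is
                      * log2(page_colors / cpu_page_colors): the number of
                      * page color bits beyond what the L2 cache
                      * distinguishes.
                      */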
1887                 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1888                 ASSERT(a > 0);
1889                 ASSERT(a < 16);
1890 
1891                 for (i = 0; i <= mmu.max_page_level; i++) {
1892                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1893                                 colorequivszc[i] = 0;
1894                                 continue;
1895                         }
1896                         while ((colors >> a) == 0)
1897                                 a--;
1898                         ASSERT(a >= 0);
1899 
1900                         /* the high 4 bits encode the color equiv mask */
1901                         colorequivszc[i] = (a << 4);
1902                 }
1903         }
1904 
1905         /* factor in colorequiv to check additional 'equivalent' bins. */
1906         if (colorequiv > 1) {
1907 
1908                 int a = lowbit(colorequiv) - 1;
1909                 if (a > 15)
1910                         a = 15;
1911 
1912                 for (i = 0; i <= mmu.max_page_level; i++) {
1913                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
1914                                 continue;
1915                         }
1916                         while ((colors >> a) == 0)
1917                                 a--;
1918                         if ((a << 4) > colorequivszc[i]) {
1919                                 colorequivszc[i] = (a << 4);
1920                         }
1921                 }
1922         }
1923 
1924         /* size for mnoderanges */
1925         for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1926                 mnoderangecnt += mnode_range_cnt(i);
1927         if (plat_dr_support_memory()) {
1928                 /*
1929                  * Reserve enough space for memory DR operations.
1930                  * Two extra mnoderanges for possible fragmentation,
1931                  * one for the 2G boundary and the other for the 4G boundary.
1932                  * We don't expect a memory board crossing the 16M boundary
1933                  * for memory hot-add operations on x86 platforms.
1934                  */
1935                 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1936         }
1937         colorsz = mnoderangecnt * sizeof (mnoderange_t);
1938 
1939         /* size for fpc_mutex and cpc_mutex */
1940         colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1941 
1942         /* size of page_freelists */
1943         colorsz += mnoderangecnt * sizeof (page_t ***);
1944         colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1945 
1946         for (i = 0; i < mmu_page_sizes; i++) {
1947                 colors = page_get_pagecolors(i);
1948                 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1949         }
1950 
1951         /* size of page_cachelists */
1952         colorsz += mnoderangecnt * sizeof (page_t **);
1953         colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1954 
1955         return (colorsz);
1956 }
1957 
1958 /*
1959  * Called once at startup to configure page_coloring data structures and
1960  * does the 1st page_free()/page_freelist_add().
1961  */
1962 void
1963 page_coloring_setup(caddr_t pcmemaddr)
1964 {
1965         int     i;
1966         int     j;
1967         int     k;
1968         caddr_t addr;
1969         int     colors;
1970 
1971         /*
1972          * do page coloring setup
1973          */
1974         addr = pcmemaddr;
1975 
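             /*
              * Carve the buffer sized by page_coloring_init() into the
              * mnoderange array, the freelist and cachelist mutexes, and
              * the page_freelists/page_cachelists pointer and color arrays.
              */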
1976         mnoderanges = (mnoderange_t *)addr;
1977         addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 
1979         mnode_range_setup(mnoderanges);
1980 
1981         for (k = 0; k < NPC_MUTEX; k++) {
1982                 fpc_mutex[k] = (kmutex_t *)addr;
1983                 addr += (max_mem_nodes * sizeof (kmutex_t));
1984         }
1985         for (k = 0; k < NPC_MUTEX; k++) {
1986                 cpc_mutex[k] = (kmutex_t *)addr;
1987                 addr += (max_mem_nodes * sizeof (kmutex_t));
1988         }
1989         page_freelists = (page_t ****)addr;
1990         addr += (mnoderangecnt * sizeof (page_t ***));
1991 
1992         page_cachelists = (page_t ***)addr;
1993         addr += (mnoderangecnt * sizeof (page_t **));
1994 
1995         for (i = 0; i < mnoderangecnt; i++) {
1996                 page_freelists[i] = (page_t ***)addr;
1997                 addr += (mmu_page_sizes * sizeof (page_t **));
1998 
1999                 for (j = 0; j < mmu_page_sizes; j++) {
2000                         colors = page_get_pagecolors(j);
2001                         page_freelists[i][j] = (page_t **)addr;
2002                         addr += (colors * sizeof (page_t *));
2003                 }
2004                 page_cachelists[i] = (page_t **)addr;
2005                 addr += (page_colors * sizeof (page_t *));
2006         }
2007 }
2008 
2009 #if defined(__xpv)
2010 /*
2011  * Give back 10% of the io_pool pages to the free list.
2012  * Don't shrink the pool below some absolute minimum.
2013  */
2014 static void
2015 page_io_pool_shrink()
2016 {
2017         int retcnt;
2018         page_t *pp, *pp_first, *pp_last, **curpool;
2019         mfn_t mfn;
2020         int bothpools = 0;
2021 
2022         mutex_enter(&io_pool_lock);
2023         io_pool_shrink_attempts++;      /* should be a kstat? */
2024         retcnt = io_pool_cnt / 10;
2025         if (io_pool_cnt - retcnt < io_pool_cnt_min)
2026                 retcnt = io_pool_cnt - io_pool_cnt_min;
2027         if (retcnt <= 0)
2028                 goto done;
2029         io_pool_shrinks++;      /* should be a kstat? */
2030         curpool = &io_pool_4g;
2031 domore:
2032         /*
2033          * Loop through, taking pages from the end of the list
2034          * (highest mfns), until the amount to return is reached.
2035          */
2036         for (pp = *curpool; pp && retcnt > 0; ) {
2037                 pp_first = pp_last = pp->p_prev;
2038                 if (pp_first == *curpool)
2039                         break;
2040                 retcnt--;
2041                 io_pool_cnt--;
2042                 page_io_pool_sub(curpool, pp_first, pp_last);
2043                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2044                         start_mfn = mfn;
2045                 page_free(pp_first, 1);
2046                 pp = *curpool;
2047         }
2048         if (retcnt != 0 && !bothpools) {
2049                 /*
2050                  * If not enough found in less constrained pool try the
2051                  * more constrained one.
2052                  */
2053                 curpool = &io_pool_16m;
2054                 bothpools = 1;
2055                 goto domore;
2056         }
2057 done:
2058         mutex_exit(&io_pool_lock);
2059 }
2060 
2061 #endif  /* __xpv */
2062 
2063 uint_t
2064 page_create_update_flags_x86(uint_t flags)
2065 {
2066 #if defined(__xpv)
2067         /*
2068          * Check this is an urgent allocation and free pages are depleted.
2069          */
2070         if (!(flags & PG_WAIT) && freemem < desfree)
2071                 page_io_pool_shrink();
2072 #else /* !__xpv */
2073         /*
2074          * page_create_get_something may call this because 4g memory may be
2075          * depleted. Set flags to allow for relocation of base page below
2076          * 4g if necessary.
2077          */
2078         if (physmax4g)
2079                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2080 #endif /* __xpv */
2081         return (flags);
2082 }
2083 
2084 /*ARGSUSED*/
2085 int
2086 bp_color(struct buf *bp)
2087 {
2088         return (0);
2089 }
2090 
2091 #if defined(__xpv)
2092 
2093 /*
2094  * Take pages out of an io_pool
2095  */
2096 static void
2097 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2098 {
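             /*
              * pp_first .. pp_last is a contiguous run in the circular,
              * doubly linked pool list; unlink it and close the run into
              * its own circular list.
              */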
2099         if (*poolp == pp_first) {
2100                 *poolp = pp_last->p_next;
2101                 if (*poolp == pp_first)
2102                         *poolp = NULL;
2103         }
2104         pp_first->p_prev->p_next = pp_last->p_next;
2105         pp_last->p_next->p_prev = pp_first->p_prev;
2106         pp_first->p_prev = pp_last;
2107         pp_last->p_next = pp_first;
2108 }
2109 
2110 /*
2111  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2112  */
2113 static void
2114 page_io_pool_add(page_t **poolp, page_t *pp)
2115 {
2116         page_t  *look;
2117         mfn_t   mfn = mfn_list[pp->p_pagenum];
2118 
2119         if (*poolp == NULL) {
2120                 *poolp = pp;
2121                 pp->p_next = pp;
2122                 pp->p_prev = pp;
2123                 return;
2124         }
2125 
2126         /*
2127          * Since we try to take pages from the high end of the pool,
2128          * chances are good that the pages to be put on the list will
2129          * go at or near the end of the list, so start at the end and
2130          * work backwards.
2131          */
2132         look = (*poolp)->p_prev;
2133         while (mfn < mfn_list[look->p_pagenum]) {
2134                 look = look->p_prev;
2135                 if (look == (*poolp)->p_prev)
2136                         break; /* backed all the way to front of list */
2137         }
2138 
2139         /* insert after look */
2140         pp->p_prev = look;
2141         pp->p_next = look->p_next;
2142         pp->p_next->p_prev = pp;
2143         look->p_next = pp;
2144         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2145                 /*
2146                  * We inserted a new first list element;
2147                  * adjust the pool pointer to the newly inserted element.
2148                  */
2149                 *poolp = pp;
2150         }
2151 }
2152 
2153 /*
2154  * Add a page to the io_pool.  Setting the force flag will force the page
2155  * into the io_pool no matter what.
2156  */
2157 static void
2158 add_page_to_pool(page_t *pp, int force)
2159 {
2160         page_t *highest;
2161         page_t *freep = NULL;
2162 
2163         mutex_enter(&io_pool_lock);
2164         /*
2165          * Always keep the scarce low memory pages
2166          */
2167         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2168                 ++io_pool_cnt;
2169                 page_io_pool_add(&io_pool_16m, pp);
2170                 goto done;
2171         }
2172         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2173                 ++io_pool_cnt;
2174                 page_io_pool_add(&io_pool_4g, pp);
2175         } else {
2176                 highest = io_pool_4g->p_prev;
2177                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2178                         page_io_pool_sub(&io_pool_4g, highest, highest);
2179                         page_io_pool_add(&io_pool_4g, pp);
2180                         freep = highest;
2181                 } else {
2182                         freep = pp;
2183                 }
2184         }
2185 done:
2186         mutex_exit(&io_pool_lock);
2187         if (freep)
2188                 page_free(freep, 1);
2189 }
2190 
2191 
2192 int contig_pfn_cnt;     /* no of pfns in the contig pfn list */
2193 int contig_pfn_max;     /* capacity of the contig pfn list */
2194 int next_alloc_pfn;     /* next position in list to start a contig search */
2195 int contig_pfnlist_updates;     /* pfn list update count */
2196 int contig_pfnlist_builds;      /* how many times have we (re)built list */
2197 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2198 int create_contig_pending;      /* nonzero means taskq creating contig list */
2199 pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2200 
2201 /*
2202  * Function to use in sorting a list of pfns by their underlying mfns.
2203  */
2204 static int
2205 mfn_compare(const void *pfnp1, const void *pfnp2)
2206 {
2207         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2208         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2209 
2210         if (mfn1 > mfn2)
2211                 return (1);
2212         if (mfn1 < mfn2)
2213                 return (-1);
2214         return (0);
2215 }
2216 
2217 /*
2218  * Compact the contig_pfn_list by tossing all the non-contiguous
2219  * elements from the list.
2220  */
2221 static void
2222 compact_contig_pfn_list(void)
2223 {
2224         pfn_t pfn, lapfn, prev_lapfn;
2225         mfn_t mfn;
2226         int i, newcnt = 0;
2227 
2228         prev_lapfn = 0;
2229         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2230                 pfn = contig_pfn_list[i];
2231                 lapfn = contig_pfn_list[i + 1];
2232                 mfn = mfn_list[pfn];
2233                 /*
2234                  * See if next pfn is for a contig mfn
2235                  */
2236                 if (mfn_list[lapfn] != mfn + 1)
2237                         continue;
2238                 /*
2239                  * pfn and lookahead are both put in list
2240                  * unless pfn is the previous lookahead.
2241                  */
2242                 if (pfn != prev_lapfn)
2243                         contig_pfn_list[newcnt++] = pfn;
2244                 contig_pfn_list[newcnt++] = lapfn;
2245                 prev_lapfn = lapfn;
2246         }
2247         for (i = newcnt; i < contig_pfn_cnt; i++)
2248                 contig_pfn_list[i] = 0;
2249         contig_pfn_cnt = newcnt;
2250 }
2251 
2252 /*ARGSUSED*/
2253 static void
2254 call_create_contiglist(void *arg)
2255 {
2256         (void) create_contig_pfnlist(PG_WAIT);
2257 }
2258 
2259 /*
2260  * Create list of freelist pfns that have underlying
2261  * contiguous mfns.  The list is kept in ascending mfn order.
2262  * Returns 1 if the list was created, else 0.
2263  */
2264 static int
2265 create_contig_pfnlist(uint_t flags)
2266 {
2267         pfn_t pfn;
2268         page_t *pp;
2269         int ret = 1;
2270 
2271         mutex_enter(&contig_list_lock);
2272         if (contig_pfn_list != NULL)
2273                 goto out;
2274         contig_pfn_max = freemem + (freemem / 10);
2275         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2276             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2277         if (contig_pfn_list == NULL) {
2278                 /*
2279                  * If we could not create the contig list (because
2280                  * we could not sleep for memory), dispatch a taskq that
2281                  * can sleep to get the memory.
2282                  */
2283                 if (!create_contig_pending) {
2284                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2285                             NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2286                                 create_contig_pending = 1;
2287                 }
2288                 contig_pfnlist_buildfailed++;   /* count list build failures */
2289                 ret = 0;
2290                 goto out;
2291         }
2292         create_contig_pending = 0;
2293         ASSERT(contig_pfn_cnt == 0);
2294         for (pfn = 0; pfn < mfn_count; pfn++) {
2295                 pp = page_numtopp_nolock(pfn);
2296                 if (pp == NULL || !PP_ISFREE(pp))
2297                         continue;
2298                 contig_pfn_list[contig_pfn_cnt] = pfn;
2299                 if (++contig_pfn_cnt == contig_pfn_max)
2300                         break;
2301         }
2302         /*
2303          * Sanity check the new list.
2304          */
2305         if (contig_pfn_cnt < 2) { /* no contig pfns */
2306                 contig_pfn_cnt = 0;
2307                 contig_pfnlist_buildfailed++;
2308                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2309                 contig_pfn_list = NULL;
2310                 contig_pfn_max = 0;
2311                 ret = 0;
2312                 goto out;
2313         }
2314         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2315         compact_contig_pfn_list();
2316         /*
2317          * Make sure next search of the newly created contiguous pfn
2318          * list starts at the beginning of the list.
2319          */
2320         next_alloc_pfn = 0;
2321         contig_pfnlist_builds++;        /* count list builds */
2322 out:
2323         mutex_exit(&contig_list_lock);
2324         return (ret);
2325 }
2326 
2327 
2328 /*
2329  * Toss the current contig pfnlist.  Someone is about to do a massive
2330  * update to pfn<->mfn mappings.  So we have them destroy the list and lock
2331  * it till they are done with their update.
2332  */
2333 void
2334 clear_and_lock_contig_pfnlist()
2335 {
2336         pfn_t *listp = NULL;
2337         size_t listsize;
2338 
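             /*
              * Note that contig_list_lock is deliberately left held on
              * return; unlock_contig_pfnlist() releases it once the
              * pfn<->mfn update is complete.
              */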
2339         mutex_enter(&contig_list_lock);
2340         if (contig_pfn_list != NULL) {
2341                 listp = contig_pfn_list;
2342                 listsize = contig_pfn_max * sizeof (pfn_t);
2343                 contig_pfn_list = NULL;
2344                 contig_pfn_max = contig_pfn_cnt = 0;
2345         }
2346         if (listp != NULL)
2347                 kmem_free(listp, listsize);
2348 }
2349 
2350 /*
2351  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2352  * it to be re-created.
2353  */
2354 void
2355 unlock_contig_pfnlist()
2356 {
2357         mutex_exit(&contig_list_lock);
2358 }
2359 
2360 /*
2361  * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2362  */
2363 void
2364 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2365 {
2366         int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2367         pfn_t probe_pfn;
2368         mfn_t probe_mfn;
2369         int drop_lock = 0;
2370 
2371         if (mutex_owner(&contig_list_lock) != curthread) {
2372                 drop_lock = 1;
2373                 mutex_enter(&contig_list_lock);
2374         }
2375         if (contig_pfn_list == NULL)
2376                 goto done;
2377         contig_pfnlist_updates++;
2378         /*
2379          * Find the pfn in the current list.  Use a binary chop to locate it.
2380          */
2381         probe_hi = contig_pfn_cnt - 1;
2382         probe_lo = 0;
2383         probe_pos = (probe_hi + probe_lo) / 2;
2384         while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2385                 if (probe_pos == probe_lo) { /* pfn not in list */
2386                         probe_pos = -1;
2387                         break;
2388                 }
2389                 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2390                         probe_lo = probe_pos;
2391                 else
2392                         probe_hi = probe_pos;
2393                 probe_pos = (probe_hi + probe_lo) / 2;
2394         }
2395         if (probe_pos >= 0) {
2396                 /*
2397                  * Remove pfn from list and ensure next alloc
2398                  * position stays in bounds.
2399                  */
2400                 if (--contig_pfn_cnt <= next_alloc_pfn)
2401                         next_alloc_pfn = 0;
2402                 if (contig_pfn_cnt < 2) { /* no contig pfns */
2403                         contig_pfn_cnt = 0;
2404                         kmem_free(contig_pfn_list,
2405                             contig_pfn_max * sizeof (pfn_t));
2406                         contig_pfn_list = NULL;
2407                         contig_pfn_max = 0;
2408                         goto done;
2409                 }
2410                 ovbcopy(&contig_pfn_list[probe_pos + 1],
2411                     &contig_pfn_list[probe_pos],
2412                     (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2413         }
2414         if (newmfn == MFN_INVALID)
2415                 goto done;
2416         /*
2417          * Check if new mfn has adjacent mfns in the list
2418          */
2419         probe_hi = contig_pfn_cnt - 1;
2420         probe_lo = 0;
2421         insert_after = -2;
2422         do {
2423                 probe_pos = (probe_hi + probe_lo) / 2;
2424                 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2425                 if (newmfn == probe_mfn + 1)
2426                         insert_after = probe_pos;
2427                 else if (newmfn == probe_mfn - 1)
2428                         insert_after = probe_pos - 1;
2429                 if (probe_pos == probe_lo)
2430                         break;
2431                 if (probe_mfn <= newmfn)
2432                         probe_lo = probe_pos;
2433                 else
2434                         probe_hi = probe_pos;
2435         } while (insert_after == -2);
2436         /*
2437          * If there is space in the list and there are adjacent mfns,
2438          * insert the pfn into its proper place in the list.
2439          */
2440         if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2441                 insert_point = insert_after + 1;
2442                 ovbcopy(&contig_pfn_list[insert_point],
2443                     &contig_pfn_list[insert_point + 1],
2444                     (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2445                 contig_pfn_list[insert_point] = pfn;
2446                 contig_pfn_cnt++;
2447         }
2448 done:
2449         if (drop_lock)
2450                 mutex_exit(&contig_list_lock);
2451 }
2452 
2453 /*
2454  * Called to (re-)populate the io_pool from the free page lists.
2455  */
2456 long
2457 populate_io_pool(void)
2458 {
2459         pfn_t pfn;
2460         mfn_t mfn, max_mfn;
2461         page_t *pp;
2462 
2463         /*
2464          * Figure out the bounds of the pool on first invocation.
2465          * We use a percentage of memory for the io pool size;
2466          * we allow that to shrink, but not below a fixed minimum.
2467          */
2468         if (io_pool_cnt_max == 0) {
2469                 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2470                 io_pool_cnt_lowater = io_pool_cnt_max;
2471                 /*
2472                  * This is the first time in populate_io_pool, grab a va to use
2473                  * when we need to allocate pages.
2474                  */
2475                 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2476         }
2477         /*
2478          * If we are out of pages in the pool, then grow the size of the pool
2479          */
2480         if (io_pool_cnt == 0) {
2481                 /*
2482                  * Grow the max size of the io pool by 5%, but never more than
2483                  * 25% of physical memory.
2484                  */
2485                 if (io_pool_cnt_max < physmem / 4)
2486                         io_pool_cnt_max += io_pool_cnt_max / 20;
2487         }
2488         io_pool_grows++;        /* should be a kstat? */
2489 
2490         /*
2491          * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2492          */
2493         (void) mfn_to_pfn(start_mfn);
2494         max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2495         for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2496                 pfn = mfn_to_pfn(mfn);
2497                 if (pfn & PFN_IS_FOREIGN_MFN)
2498                         continue;
2499                 /*
2500                  * try to allocate it from free pages
2501                  */
2502                 pp = page_numtopp_alloc(pfn);
2503                 if (pp == NULL)
2504                         continue;
2505                 PP_CLRFREE(pp);
2506                 add_page_to_pool(pp, 1);
2507                 if (io_pool_cnt >= io_pool_cnt_max)
2508                         break;
2509         }
2510 
2511         return (io_pool_cnt);
2512 }
2513 
2514 /*
2515  * Destroy a page that was being used for DMA I/O. It may or
2516  * may not actually go back to the io_pool.
2517  */
2518 void
2519 page_destroy_io(page_t *pp)
2520 {
2521         mfn_t mfn = mfn_list[pp->p_pagenum];
2522 
2523         /*
2524          * A reservation was made when the page was allocated; release it now.
2525          */
2526         page_unresv(1);
2527         /*
2528          * Unload translations, if any, then hash out the
2529          * page to erase its identity.
2530          */
2531         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2532         page_hashout(pp, NULL);
2533 
2534         /*
2535          * If the page came from the free lists, just put it back to them.
2536          * DomU pages always go on the free lists as well.
2537          */
2538         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2539                 page_free(pp, 1);
2540                 return;
2541         }
2542 
2543         add_page_to_pool(pp, 0);
2544 }
2545 
2546 
2547 long contig_searches;           /* count of times contig pages requested */
2548 long contig_search_restarts;    /* count of contig ranges tried */
2549 long contig_search_failed;      /* count of contig alloc failures */
2550 
2551 /*
2552  * Free partial page list
2553  */
2554 static void
2555 free_partial_list(page_t **pplist)
2556 {
2557         page_t *pp;
2558 
2559         while (*pplist != NULL) {
2560                 pp = *pplist;
2561                 page_io_pool_sub(pplist, pp, pp);
2562                 page_free(pp, 1);
2563         }
2564 }
2565 
2566 /*
2567  * Look through the contiguous pfns that are not part of the io_pool for
2568  * contiguous free pages.  Return a list of the found pages or NULL.
2569  */
2570 page_t *
2571 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2572     pgcnt_t pfnalign)
2573 {
2574         page_t *pp, *plist = NULL;
2575         mfn_t mfn, prev_mfn, start_mfn;
2576         pfn_t pfn;
2577         int pages_needed, pages_requested;
2578         int search_start;
2579 
2580         /*
2581          * create the contig pfn list if not already done
2582          */
2583 retry:
2584         mutex_enter(&contig_list_lock);
2585         if (contig_pfn_list == NULL) {
2586                 mutex_exit(&contig_list_lock);
2587                 if (!create_contig_pfnlist(flags)) {
2588                         return (NULL);
2589                 }
2590                 goto retry;
2591         }
2592         contig_searches++;
2593         /*
2594          * Search contiguous pfn list for physically contiguous pages not in
2595          * the io_pool.  Start the search where the last search left off.
2596          */
2597         pages_requested = pages_needed = npages;
2598         search_start = next_alloc_pfn;
2599         start_mfn = prev_mfn = 0;
2600         while (pages_needed) {
2601                 pfn = contig_pfn_list[next_alloc_pfn];
2602                 mfn = pfn_to_mfn(pfn);
2603                 /*
2604                  * Check whether this mfn is the first or is contiguous
2605                  * with the previous one, whether its page is free, and
2606                  * that the mfn range does not cross a segment boundary.
2607                  */
2608                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2609                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2610                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2611                         PP_CLRFREE(pp);
2612                         page_io_pool_add(&plist, pp);
2613                         pages_needed--;
2614                         if (prev_mfn == 0) {
2615                                 if (pfnalign &&
2616                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2617                                         /*
2618                                          * not properly aligned
2619                                          */
2620                                         contig_search_restarts++;
2621                                         free_partial_list(&plist);
2622                                         pages_needed = pages_requested;
2623                                         start_mfn = prev_mfn = 0;
2624                                         goto skip;
2625                                 }
2626                                 start_mfn = mfn;
2627                         }
2628                         prev_mfn = mfn;
2629                 } else {
2630                         contig_search_restarts++;
2631                         free_partial_list(&plist);
2632                         pages_needed = pages_requested;
2633                         start_mfn = prev_mfn = 0;
2634                 }
2635 skip:
2636                 if (++next_alloc_pfn == contig_pfn_cnt)
2637                         next_alloc_pfn = 0;
2638                 if (next_alloc_pfn == search_start)
2639                         break; /* all pfns searched */
2640         }
2641         mutex_exit(&contig_list_lock);
2642         if (pages_needed) {
2643                 contig_search_failed++;
2644                 /*
2645                  * Failed to find enough contig pages;
2646                  * free the partial page list.
2647                  */
2648                 free_partial_list(&plist);
2649         }
2650         return (plist);
2651 }
2652 
2653 /*
2654  * Search the reserved io pool pages for a page range with the
2655  * desired characteristics.
2656  */
2657 page_t *
2658 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2659 {
2660         page_t *pp_first, *pp_last;
2661         page_t *pp, **poolp;
2662         pgcnt_t nwanted, pfnalign;
2663         uint64_t pfnseg;
2664         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2665         int align, attempt = 0;
2666 
2667         if (minctg == 1)
2668                 contig = 0;
2669         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2670         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2671         pfnseg = mmu_btop(mattr->dma_attr_seg);
2672         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2673         if (align > MMU_PAGESIZE)
2674                 pfnalign = mmu_btop(align);
2675         else
2676                 pfnalign = 0;
2677 
2678 try_again:
2679         /*
2680          * See if we want pages for a legacy device
2681          */
2682         if (hi_mfn < PFN_16MEG)
2683                 poolp = &io_pool_16m;
2684         else
2685                 poolp = &io_pool_4g;
2686 try_smaller:
2687         /*
2688          * Take pages from I/O pool. We'll use pages from the highest
2689          * MFN range possible.
2690          */
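             /*
              * The pool list is kept in ascending mfn order (see
              * page_io_pool_add()), so walking p_prev from the head
              * visits the highest mfns first.
              */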
2691         pp_first = pp_last = NULL;
2692         mutex_enter(&io_pool_lock);
2693         nwanted = minctg;
2694         for (pp = *poolp; pp && nwanted > 0; ) {
2695                 pp = pp->p_prev;
2696 
2697                 /*
2698                  * skip pages above allowable range
2699                  */
2700                 mfn = mfn_list[pp->p_pagenum];
2701                 if (hi_mfn < mfn)
2702                         goto skip;
2703 
2704                 /*
2705                  * stop at pages below allowable range
2706                  */
2707                 if (lo_mfn > mfn)
2708                         break;
2709 restart:
2710                 if (pp_last == NULL) {
2711                         /*
2712                          * Check alignment
2713                          */
2714                         tmfn = mfn - (minctg - 1);
2715                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2716                                 goto skip; /* not properly aligned */
2717                         /*
2718                          * Check segment
2719                          */
2720                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2721                                 goto skip; /* crosses seg boundary */
2722                         /*
2723                          * Start building page list
2724                          */
2725                         pp_first = pp_last = pp;
2726                         nwanted--;
2727                 } else {
2728                         /*
2729                          * check physical contiguity if required
2730                          */
2731                         if (contig &&
2732                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2733                                 /*
2734                                  * not a contiguous page, restart list.
2735                                  */
2736                                 pp_last = NULL;
2737                                 nwanted = minctg;
2738                                 goto restart;
2739                         } else { /* add page to list */
2740                                 pp_first = pp;
2741                                 nwanted--;
2742                         }
2743                 }
2744 skip:
2745                 if (pp == *poolp)
2746                         break;
2747         }
2748 
2749         /*
2750          * If we didn't find memory, try the more constrained pool, then
2751          * sweep free pages into the DMA pool and try again.
2752          */
2753         if (nwanted != 0) {
2754                 mutex_exit(&io_pool_lock);
2755                 /*
2756                  * If we were looking in the less constrained pool and
2757                  * didn't find pages, try the more constrained pool.
2758                  */
2759                 if (poolp == &io_pool_4g) {
2760                         poolp = &io_pool_16m;
2761                         goto try_smaller;
2762                 }
2763                 kmem_reap();
2764                 if (++attempt < 4) {
2765                         /*
2766                          * Grab some more io_pool pages
2767                          */
2768                         (void) populate_io_pool();
2769                         goto try_again; /* go around and retry */
2770                 }
2771                 return (NULL);
2772         }
2773         /*
2774          * Found the pages, now snip them from the list
2775          */
2776         page_io_pool_sub(poolp, pp_first, pp_last);
2777         io_pool_cnt -= minctg;
2778         /*
2779          * reset low water mark
2780          */
2781         if (io_pool_cnt < io_pool_cnt_lowater)
2782                 io_pool_cnt_lowater = io_pool_cnt;
2783         mutex_exit(&io_pool_lock);
2784         return (pp_first);
2785 }
2786 
2787 page_t *
2788 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2789     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2790 {
2791         uint_t kflags;
2792         int order, extra, extpages, i, contig, nbits, extents;
2793         page_t *pp, *expp, *pp_first, **pplist = NULL;
2794         mfn_t *mfnlist = NULL;
2795 
2796         contig = flags & PG_PHYSCONTIG;
2797         if (minctg == 1)
2798                 contig = 0;
2799         flags &= ~PG_PHYSCONTIG;
2800         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2801         /*
2802          * The hypervisor will allocate extents; if we want contig
2803          * pages, the extent must be >= minctg.
2804          */
2805         if (contig) {
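                     /*
                      * Round minctg up to a power of two, since extents
                      * are 2^order pages.
                      */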
2806                 order = highbit(minctg) - 1;
2807                 if (minctg & ((1 << order) - 1))
2808                         order++;
2809                 extpages = 1 << order;
2810         } else {
2811                 order = 0;
2812                 extpages = minctg;
2813         }
2814         if (extpages > minctg) {
2815                 extra = extpages - minctg;
2816                 if (!page_resv(extra, kflags))
2817                         return (NULL);
2818         }
2819         pp_first = NULL;
2820         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2821         if (pplist == NULL)
2822                 goto balloon_fail;
2823         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2824         if (mfnlist == NULL)
2825                 goto balloon_fail;
2826         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2827         if (pp == NULL)
2828                 goto balloon_fail;
2829         pp_first = pp;
2830         if (extpages > minctg) {
2831                 /*
2832                  * fill out the rest of extent pages to swap
2833                  * with the hypervisor
2834                  */
2835                 for (i = 0; i < extra; i++) {
2836                         expp = page_create_va(vp,
2837                             (u_offset_t)(uintptr_t)io_pool_kva,
2838                             PAGESIZE, flags, &kvseg, io_pool_kva);
2839                         if (expp == NULL)
2840                                 goto balloon_fail;
2841                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2842                         page_io_unlock(expp);
2843                         page_hashout(expp, NULL);
2844                         page_io_lock(expp);
2845                         /*
2846                          * add page to end of list
2847                          */
2848                         expp->p_prev = pp_first->p_prev;
2849                         expp->p_next = pp_first;
2850                         expp->p_prev->p_next = expp;
2851                         pp_first->p_prev = expp;
2852                 }
2853 
2854         }
2855         for (i = 0; i < extpages; i++) {
2856                 pplist[i] = pp;
2857                 pp = pp->p_next;
2858         }
2859         nbits = highbit(mattr->dma_attr_addr_hi);
2860         extents = contig ? 1 : minctg;
2861         if (balloon_replace_pages(extents, pplist, nbits, order,
2862             mfnlist) != extents) {
2863                 if (ioalloc_dbg)
2864                         cmn_err(CE_NOTE, "request to hypervisor"
2865                             " for %d pages, maxaddr %" PRIx64 " failed",
2866                             extpages, mattr->dma_attr_addr_hi);
2867                 goto balloon_fail;
2868         }
2869 
2870         kmem_free(pplist, extpages * sizeof (page_t *));
2871         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2872         /*
2873          * Return any excess pages to free list
2874          */
2875         if (extpages > minctg) {
2876                 for (i = 0; i < extra; i++) {
2877                         pp = pp_first->p_prev;
2878                         page_sub(&pp_first, pp);
2879                         page_io_unlock(pp);
2880                         page_unresv(1);
2881                         page_free(pp, 1);
2882                 }
2883         }
2884         return (pp_first);
2885 balloon_fail:
2886         /*
2887          * Return pages to free list and return failure
2888          */
2889         while (pp_first != NULL) {
2890                 pp = pp_first;
2891                 page_sub(&pp_first, pp);
2892                 page_io_unlock(pp);
2893                 if (pp->p_vnode != NULL)
2894                         page_hashout(pp, NULL);
2895                 page_free(pp, 1);
2896         }
2897         if (pplist)
2898                 kmem_free(pplist, extpages * sizeof (page_t *));
2899         if (mfnlist)
2900                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2901         page_unresv(extpages - minctg);
2902         return (NULL);
2903 }
2904 
2905 static void
2906 return_partial_alloc(page_t *plist)
2907 {
2908         page_t *pp;
2909 
2910         while (plist != NULL) {
2911                 pp = plist;
2912                 page_sub(&plist, pp);
2913                 page_io_unlock(pp);
2914                 page_destroy_io(pp);
2915         }
2916 }
2917 
2918 static page_t *
2919 page_get_contigpages(
2920         struct vnode    *vp,
2921         u_offset_t      off,
2922         int             *npagesp,
2923         uint_t          flags,
2924         caddr_t         vaddr,
2925         ddi_dma_attr_t  *mattr)
2926 {
2927         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2928         page_t  *plist; /* list to return */
2929         page_t  *pp, *mcpl;
2930         int     contig, anyaddr, npages, getone = 0;
2931         mfn_t   lo_mfn;
2932         mfn_t   hi_mfn;
2933         pgcnt_t pfnalign = 0;
2934         int     align, sgllen;
2935         uint64_t pfnseg;
2936         pgcnt_t minctg;
2937 
2938         npages = *npagesp;
2939         ASSERT(mattr != NULL);
2940         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2941         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2942         sgllen = mattr->dma_attr_sgllen;
2943         pfnseg = mmu_btop(mattr->dma_attr_seg);
2944         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2945         if (align > MMU_PAGESIZE)
2946                 pfnalign = mmu_btop(align);
2947 
2948         contig = flags & PG_PHYSCONTIG;
2949         if (npages == -1) {
2950                 npages = 1;
2951                 pfnalign = 0;
2952         }
2953         /*
2954          * Clear the contig flag if only one page is needed.
2955          */
2956         if (npages == 1) {
2957                 getone = 1;
2958                 contig = 0;
2959         }
2960 
2961         /*
2962          * Check if any page in the system is fine.
2963          */
2964         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2965         if (!contig && anyaddr && !pfnalign) {
2966                 flags &= ~PG_PHYSCONTIG;
2967                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2968                     flags, &kvseg, vaddr);
2969                 if (plist != NULL) {
2970                         *npagesp = 0;
2971                         return (plist);
2972                 }
2973         }
2974         plist = NULL;
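             /*
              * Each contiguous chunk must hold at least
              * howmany(npages, sgllen) pages so that the request fits in
              * the scatter/gather list; *npagesp is updated with however
              * many pages remain.
              */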
2975         minctg = howmany(npages, sgllen);
2976         while (npages > sgllen || getone) {
2977                 if (minctg > npages)
2978                         minctg = npages;
2979                 mcpl = NULL;
2980                 /*
2981                  * We could want contig pages with no address range limits.
2982                  */
2983                 if (anyaddr && contig) {
2984                         /*
2985                          * Look for free contig pages to satisfy the request.
2986                          */
2987                         mcpl = find_contig_free(minctg, flags, pfnseg,
2988                             pfnalign);
2989                 }
2990                 /*
2991                  * Try the reserved io pools next
2992                  */
2993                 if (mcpl == NULL)
2994                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2995                 if (mcpl != NULL) {
2996                         pp = mcpl;
2997                         do {
2998                                 if (!page_hashin(pp, vp, off, NULL)) {
2999                                         panic("page_get_contigpages:"
3000                                             " hashin failed"
3001                                             " pp %p, vp %p, off %llx",
3002                                             (void *)pp, (void *)vp, off);
3003                                 }
3004                                 off += MMU_PAGESIZE;
3005                                 PP_CLRFREE(pp);
3006                                 PP_CLRAGED(pp);
3007                                 page_set_props(pp, P_REF);
3008                                 page_io_lock(pp);
3009                                 pp = pp->p_next;
3010                         } while (pp != mcpl);
3011                 } else {
3012                         /*
3013                          * Hypervisor exchange doesn't handle segment or
3014                          * alignment constraints
3015                          */
3016                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3017                             pfnalign)
3018                                 goto fail;
3019                         /*
3020                          * Try exchanging pages with the hypervisor
3021                          */
3022                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3023                             flags, minctg);
3024                         if (mcpl == NULL)
3025                                 goto fail;
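                             /*
                              * page_swap_with_hypervisor() was passed vp/off
                              * and sets the pages up itself, so just advance
                              * the offset past this run.
                              */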
3026                         off += minctg * MMU_PAGESIZE;
3027                 }
3028                 check_dma(mattr, mcpl, minctg);
3029                 /*
3030                  * We now have a run of minctg contiguous pages; add them to
3031                  * the list we will return for this request.
3032                  */
3033                 page_list_concat(&plist, &mcpl);
3034                 npages -= minctg;
3035                 *npagesp = npages;
3036                 sgllen--;
3037                 if (getone)
3038                         break;
3039         }
3040         return (plist);
3041 fail:
3042         return_partial_alloc(plist);
3043         return (NULL);
3044 }
3045 
3046 /*
3047  * Allocator for domain 0 I/O pages. We match the required
3048  * DMA attributes and contiguity constraints.
3049  */
3050 /*ARGSUSED*/
3051 page_t *
3052 page_create_io(
3053         struct vnode    *vp,
3054         u_offset_t      off,
3055         uint_t          bytes,
3056         uint_t          flags,
3057         struct as       *as,
3058         caddr_t         vaddr,
3059         ddi_dma_attr_t  *mattr)
3060 {
3061         page_t  *plist = NULL, *pp;
3062         int     npages = 0, contig, anyaddr, pages_req;
3063         mfn_t   lo_mfn;
3064         mfn_t   hi_mfn;
3065         pgcnt_t pfnalign = 0;
3066         int     align;
3067         int     is_domu = 0;
3068         int     dummy, bytes_got;
3069         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3070 
3071         ASSERT(mattr != NULL);
3072         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3073         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3074         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3075         if (align > MMU_PAGESIZE)
3076                 pfnalign = mmu_btop(align);
3077 
3078         /*
3079          * Clear the contig flag if only one page is needed or the scatter
3080          * gather list length is >= npages.
3081          */
3082         pages_req = npages = mmu_btopr(bytes);
3083         contig = (flags & PG_PHYSCONTIG);
3084         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3085         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3086                 contig = 0;
3087 
3088         /*
3089          * Check if any old page in the system is fine.
3090          * DomU should always go down this path.
3091          */
3092         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3093         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3094         if ((!contig && anyaddr) || is_domu) {
3095                 flags &= ~PG_PHYSCONTIG;
3096                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3097                 if (plist != NULL)
3098                         return (plist);
3099                 else if (is_domu)
3100                         return (NULL); /* no memory available */
3101         }
3102         /*
3103          * DomU should never reach here
3104          */
3105         if (contig) {
3106                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3107                     mattr);
3108                 if (plist == NULL)
3109                         goto fail;
3110                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3111                 vaddr += bytes_got;
3112                 off += bytes_got;
3113                 /*
3114                  * We now have all the contiguous pages we need, but
3115                  * we may still need additional non-contiguous pages.
3116                  */
3117         }
3118         /*
3119          * Now loop collecting the requested number of pages.  These do
3120          * not have to be contiguous, but we use the contig page alloc
3121          * code to get them anyway, since it will honor any other
3122          * constraints the pages may have.
3123          */
3124         while (npages--) {
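                     /*
                      * -1 asks page_get_contigpages() for a single page with
                      * no alignment constraint.
                      */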
3125                 dummy = -1;
3126                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3127                 if (pp == NULL)
3128                         goto fail;
3129                 page_add(&plist, pp);
3130                 vaddr += MMU_PAGESIZE;
3131                 off += MMU_PAGESIZE;
3132         }
3133         return (plist);
3134 fail:
3135         /*
3136          * Failed to get enough pages, return ones we did get
3137          */
3138         return_partial_alloc(plist);
3139         return (NULL);
3140 }
3141 
3142 /*
3143  * Lock and return the page with the highest mfn that we can find.  last_mfn
3144  * holds the last one found, so the next search can start from there.  We
3145  * also keep a counter so that we don't loop forever if the machine has no
3146  * free pages.
3147  *
3148  * This is called from the balloon thread to find pages to give away.  new_high
3149  * is used when new mfn's have been added to the system - we will reset our
3150  * search if the new mfn's are higher than our current search position.
3151  */
3152 page_t *
3153 page_get_high_mfn(mfn_t new_high)
3154 {
3155         static mfn_t last_mfn = 0;
3156         pfn_t pfn;
3157         page_t *pp;
3158         ulong_t loop_count = 0;
3159 
3160         if (new_high > last_mfn)
3161                 last_mfn = new_high;
3162 
3163         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
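                     /* Wrap back to cached_max_mfn when we reach mfn 0. */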
3164                 if (last_mfn == 0) {
3165                         last_mfn = cached_max_mfn;
3166                 }
3167 
3168                 pfn = mfn_to_pfn(last_mfn);
3169                 if (pfn & PFN_IS_FOREIGN_MFN)
3170                         continue;
3171 
3172                 /* See if the page is free.  If so, lock it. */
3173                 pp = page_numtopp_alloc(pfn);
3174                 if (pp == NULL)
3175                         continue;
3176                 PP_CLRFREE(pp);
3177 
3178                 ASSERT(PAGE_EXCL(pp));
3179                 ASSERT(pp->p_vnode == NULL);
3180                 ASSERT(!hat_page_is_mapped(pp));
3181                 last_mfn--;
3182                 return (pp);
3183         }
3184         return (NULL);
3185 }
3186 
3187 #else /* !__xpv */
3188 
3189 /*
3190  * Get a page from any list (freelist or cachelist) within the given mnode
3191  */
3192 static page_t *
3193 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3194     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3195 {
3196         kmutex_t                *pcm;
3197         int                     i;
3198         page_t                  *pp;
3199         page_t                  *first_pp;
3200         uint64_t                pgaddr;
3201         ulong_t                 bin;
3202         int                     mtypestart;
3203         int                     plw_initialized;
3204         page_list_walker_t      plw;
3205 
3206         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3207 
3208         ASSERT((flags & PG_MATCH_COLOR) == 0);
3209         ASSERT(szc == 0);
3210         ASSERT(dma_attr != NULL);
3211 
3212         MTYPE_START(mnode, mtype, flags);
3213         if (mtype < 0) {
3214                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3215                 return (NULL);
3216         }
3217 
3218         mtypestart = mtype;
3219 
3220         bin = origbin;
3221 
3222         /*
3223          * check up to page_colors + 1 bins - origbin may be checked twice
3224          * because of BIN_STEP skip
3225          */
3226         do {
3227                 plw_initialized = 0;
3228 
3229                 for (plw.plw_count = 0;
3230                     plw.plw_count < page_colors; plw.plw_count++) {
3231 
3232                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3233                                 goto nextfreebin;
3234 
3235                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3236                         mutex_enter(pcm);
3237                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3238                         first_pp = pp;
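                             /*
                              * Walk this bin's freelist looking for a page we
                              * can exclusively lock whose address lies within
                              * the DMA range, skipping dump pages.
                              */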
3239                         while (pp != NULL) {
3240                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3241                                     SE_EXCL) == 0) {
3242                                         pp = pp->p_next;
3243                                         if (pp == first_pp) {
3244                                                 pp = NULL;
3245                                         }
3246                                         continue;
3247                                 }
3248 
3249                                 ASSERT(PP_ISFREE(pp));
3250                                 ASSERT(PP_ISAGED(pp));
3251                                 ASSERT(pp->p_vnode == NULL);
3252                                 ASSERT(pp->p_hash == NULL);
3253                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3254                                 ASSERT(pp->p_szc == szc);
3255                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3256                                 /* check if page within DMA attributes */
3257                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3258                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3259                                     (pgaddr + MMU_PAGESIZE - 1 <=
3260                                     dma_attr->dma_attr_addr_hi)) {
3261                                         break;
3262                                 }
3263 
3264                                 /* continue looking */
3265                                 page_unlock(pp);
3266                                 pp = pp->p_next;
3267                                 if (pp == first_pp)
3268                                         pp = NULL;
3269 
3270                         }
3271                         if (pp != NULL) {
3272                                 ASSERT(mtype == PP_2_MTYPE(pp));
3273                                 ASSERT(pp->p_szc == 0);
3274 
3275                                 /* found a page with specified DMA attributes */
3276                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3277                                     mtype), pp);
3278                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3279 
3280                                 if ((PP_ISFREE(pp) == 0) ||
3281                                     (PP_ISAGED(pp) == 0)) {
3282                                         cmn_err(CE_PANIC, "page %p is not free",
3283                                             (void *)pp);
3284                                 }
3285 
3286                                 mutex_exit(pcm);
3287                                 check_dma(dma_attr, pp, 1);
3288                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3289                                 return (pp);
3290                         }
3291                         mutex_exit(pcm);
3292 nextfreebin:
3293                         if (plw_initialized == 0) {
3294                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3295                                 ASSERT(plw.plw_ceq_dif == page_colors);
3296                                 plw_initialized = 1;
3297                         }
3298 
3299                         if (plw.plw_do_split) {
3300                                 pp = page_freelist_split(szc, bin, mnode,
3301                                     mtype,
3302                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3303                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3304                                     &plw);
3305                                 if (pp != NULL) {
3306                                         check_dma(dma_attr, pp, 1);
3307                                         return (pp);
3308                                 }
3309                         }
3310 
3311                         bin = page_list_walk_next_bin(szc, bin, &plw);
3312                 }
3313 
3314                 MTYPE_NEXT(mnode, mtype, flags);
3315         } while (mtype >= 0);
3316 
3317         /* failed to find a page in the freelist; try it in the cachelist */
3318 
3319         /* reset mtype start for cachelist search */
3320         mtype = mtypestart;
3321         ASSERT(mtype >= 0);
3322 
3323         /* start with the bin of matching color */
3324         bin = origbin;
3325 
3326         do {
3327                 for (i = 0; i <= page_colors; i++) {
3328                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3329                                 goto nextcachebin;
3330                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3331                         mutex_enter(pcm);
3332                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3333                         first_pp = pp;
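                             /*
                              * Same walk as the freelist case above, but over
                              * the cachelist: find a lockable, non-dump page
                              * within the DMA address range.
                              */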
3334                         while (pp != NULL) {
3335                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3336                                     SE_EXCL) == 0) {
3337                                         pp = pp->p_next;
3338                                         if (pp == first_pp)
3339                                                 pp = NULL;
3340                                         continue;
3341                                 }
3342                                 ASSERT(pp->p_vnode);
3343                                 ASSERT(PP_ISAGED(pp) == 0);
3344                                 ASSERT(pp->p_szc == 0);
3345                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3346 
3347                                 /* check if page within DMA attributes */
3348 
3349                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3350                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3351                                     (pgaddr + MMU_PAGESIZE - 1 <=
3352                                     dma_attr->dma_attr_addr_hi)) {
3353                                         break;
3354                                 }
3355 
3356                                 /* continue looking */
3357                                 page_unlock(pp);
3358                                 pp = pp->p_next;
3359                                 if (pp == first_pp)
3360                                         pp = NULL;
3361                         }
3362 
3363                         if (pp != NULL) {
3364                                 ASSERT(mtype == PP_2_MTYPE(pp));
3365                                 ASSERT(pp->p_szc == 0);
3366 
3367                                 /* found a page with specified DMA attributes */
3368                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3369                                     mtype), pp);
3370                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3371 
3372                                 mutex_exit(pcm);
3373                                 ASSERT(pp->p_vnode);
3374                                 ASSERT(PP_ISAGED(pp) == 0);
3375                                 check_dma(dma_attr, pp, 1);
3376                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3377                                 return (pp);
3378                         }
3379                         mutex_exit(pcm);
3380 nextcachebin:
3381                         bin += (i == 0) ? BIN_STEP : 1;
3382                         bin &= page_colors_mask;
3383                 }
3384                 MTYPE_NEXT(mnode, mtype, flags);
3385         } while (mtype >= 0);
3386 
3387         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3388         return (NULL);
3389 }
3390 
3391 /*
3392  * This function is similar to page_get_freelist()/page_get_cachelist()
3393  * but it searches both the lists to find a page with the specified
3394  * color (or no color) and DMA attributes. The search is done in the
3395  * freelist first and then in the cache list within the highest memory
3396  * range (based on DMA attributes) before searching in the lower
3397  * memory ranges.
3398  *
3399  * Note: This function is called only by page_create_io().
3400  */
3401 /*ARGSUSED*/
3402 static page_t *
3403 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3404     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3405 {
3406         uint_t          bin;
3407         int             mtype;
3408         page_t          *pp;
3409         int             n;
3410         int             m;
3411         int             szc;
3412         int             fullrange;
3413         int             mnode;
3414         int             local_failed_stat = 0;
3415         lgrp_mnode_cookie_t     lgrp_cookie;
3416 
3417         VM_STAT_ADD(pga_vmstats.pga_alloc);
3418 
3419         /* only base pagesize currently supported */
3420         if (size != MMU_PAGESIZE)
3421                 return (NULL);
3422 
3423         /*
3424          * If we're passed a specific lgroup, we use it.  Otherwise,
3425          * assume first-touch placement is desired.
3426          */
3427         if (!LGRP_EXISTS(lgrp))
3428                 lgrp = lgrp_home_lgrp();
3429 
3430         /* LINTED */
3431         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
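             /* Pick the starting color bin from the vnode/address. */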
3432 
3433         /*
3434          * Only hold one freelist or cachelist lock at a time, that way we
3435          * can start anywhere and not have to worry about lock
3436          * ordering.
3437          */
3438         if (dma_attr == NULL) {
3439                 n = mtype16m;
3440                 m = mtypetop;
3441                 fullrange = 1;
3442                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3443         } else {
3444                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3445                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3446 
3447                 /*
3448                  * We can only guarantee alignment to a page boundary.
3449                  */
3450                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3451                         return (NULL);
3452 
3453                 /* Sanity check the dma_attr */
3454                 if (pfnlo > pfnhi)
3455                         return (NULL);
3456 
3457                 n = pfn_2_mtype(pfnlo);
3458                 m = pfn_2_mtype(pfnhi);
3459 
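                     /*
                      * fullrange means the DMA limits completely cover the
                      * mtype ranges n..m, so the ordinary freelist/cachelist
                      * code can be used instead of the address-filtered
                      * page_get_mnode_anylist().
                      */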
3460                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3461                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3462         }
3463         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3464 
3465         szc = 0;
3466 
3467         /* cycling through mtype is handled by RANGE0 if n == mtype16m */
3468         if (n == mtype16m) {
3469                 flags |= PGI_MT_RANGE0;
3470                 n = m;
3471         }
3472 
3473         /*
3474          * Try local memory node first, but try remote if we can't
3475          * get a page of the right color.
3476          */
3477         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3478         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3479                 /*
3480                  * allocate pages from high pfn to low.
3481                  */
3482                 mtype = m;
3483                 do {
3484                         if (fullrange != 0) {
3485                                 pp = page_get_mnode_freelist(mnode,
3486                                     bin, mtype, szc, flags);
3487                                 if (pp == NULL) {
3488                                         pp = page_get_mnode_cachelist(
3489                                             bin, flags, mnode, mtype);
3490                                 }
3491                         } else {
3492                                 pp = page_get_mnode_anylist(bin, szc,
3493                                     flags, mnode, mtype, dma_attr);
3494                         }
3495                         if (pp != NULL) {
3496                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3497                                 check_dma(dma_attr, pp, 1);
3498                                 return (pp);
3499                         }
3500                 } while (mtype != n &&
3501                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3502                 if (!local_failed_stat) {
3503                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3504                         local_failed_stat = 1;
3505                 }
3506         }
3507         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3508 
3509         return (NULL);
3510 }
3511 
3512 /*
3513  * page_create_io()
3514  *
3515  * This function is a copy of page_create_va() with an additional
3516  * argument 'mattr' that specifies DMA memory requirements to
3517  * the page list functions. This function is used by the segkmem
3518  * allocator so it is used only to create new pages (i.e. PG_EXCL is
3519  * set).
3520  *
3521  * Note: This interface is currently used by x86 PSM only and is
3522  *       not fully specified so the commitment level is only for
3523  *       private interface specific to x86. This interface uses PSM
3524  *       specific page_get_anylist() interface.
3525  */
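     /*
      * A minimal usage sketch (hypothetical caller and values; real callers
      * live in the x86 PSM and supply their device's DMA attributes):
      *
      *     ddi_dma_attr_t *attrp = ...;            (device DMA attributes)
      *     page_t *plist = page_create_io(&kvp, off, ptob(npages),
      *         PG_EXCL | PG_WAIT, &kas, vaddr, attrp);
      *
      * On failure NULL is returned and the freemem/pcf accounting done for
      * the request is undone.
      */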
3526 
3527 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3528         for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3529                 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3530                         break; \
3531         } \
3532 }
3533 
3534 
3535 page_t *
3536 page_create_io(
3537         struct vnode    *vp,
3538         u_offset_t      off,
3539         uint_t          bytes,
3540         uint_t          flags,
3541         struct as       *as,
3542         caddr_t         vaddr,
3543         ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
3544 {
3545         page_t          *plist = NULL;
3546         uint_t          plist_len = 0;
3547         pgcnt_t         npages;
3548         page_t          *npp = NULL;
3549         uint_t          pages_req;
3550         page_t          *pp;
3551         kmutex_t        *phm = NULL;
3552         uint_t          index;
3553 
3554         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3555             "page_create_start:vp %p off %llx bytes %u flags %x",
3556             vp, off, bytes, flags);
3557 
3558         ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3559 
3560         pages_req = npages = mmu_btopr(bytes);
3561 
3562         /*
3563          * Do the freemem and pcf accounting.
3564          */
3565         if (!page_create_wait(npages, flags)) {
3566                 return (NULL);
3567         }
3568 
3569         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3570             "page_create_success:vp %p off %llx", vp, off);
3571 
3572         /*
3573          * If satisfying this request has left us with too little
3574          * memory, start the wheels turning to get some back.  The
3575          * first clause of the test prevents waking up the pageout
3576          * daemon in situations where it would decide that there's
3577          * nothing to do.
3578          */
3579         if (nscan < desscan && freemem < minfree) {
3580                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3581                     "pageout_cv_signal:freemem %ld", freemem);
3582                 cv_signal(&proc_pageout->p_cv);
3583         }
3584 
3585         if (flags & PG_PHYSCONTIG) {
3586 
3587                 plist = page_get_contigpage(&npages, mattr, 1);
3588                 if (plist == NULL) {
3589                         page_create_putback(npages);
3590                         return (NULL);
3591                 }
3592 
3593                 pp = plist;
3594 
3595                 do {
3596                         if (!page_hashin(pp, vp, off, NULL)) {
3597                                 panic("pg_creat_io: hashin failed %p %p %llx",
3598                                     (void *)pp, (void *)vp, off);
3599                         }
3600                         VM_STAT_ADD(page_create_new);
3601                         off += MMU_PAGESIZE;
3602                         PP_CLRFREE(pp);
3603                         PP_CLRAGED(pp);
3604                         page_set_props(pp, P_REF);
3605                         pp = pp->p_next;
3606                 } while (pp != plist);
3607 
3608                 if (!npages) {
3609                         check_dma(mattr, plist, pages_req);
3610                         return (plist);
3611                 } else {
3612                         vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3613                 }
3614 
3615                 /*
3616                  * fall-thru:
3617                  *
3618                  * page_get_contigpage returns when npages <= sgllen.
3619                  * Grab the rest of the non-contig pages below from anylist.
3620                  */
3621         }
3622 
3623         /*
3624          * Loop around collecting the requested number of pages.
3625          * Most of the time, we have to `create' a new page. With
3626          * this in mind, pull the page off the free list before
3627          * getting the hash lock.  This will minimize the hash
3628          * lock hold time, nesting, and the like.  If it turns
3629          * out we don't need the page, we put it back at the end.
3630          */
3631         while (npages--) {
3632                 phm = NULL;
3633 
3634                 index = PAGE_HASH_FUNC(vp, off);
3635 top:
3636                 ASSERT(phm == NULL);
3637                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3638                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3639 
3640                 if (npp == NULL) {
3641                         /*
3642                          * Try to get the page of any color either from
3643                          * the freelist or from the cache list.
3644                          */
3645                         npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3646                             flags & ~PG_MATCH_COLOR, mattr, NULL);
3647                         if (npp == NULL) {
3648                                 if (mattr == NULL) {
3649                                         /*
3650                                          * Not looking for a special page;
3651                                          * panic!
3652                                          */
3653                                         panic("no page found %d", (int)npages);
3654                                 }
3655                                 /*
3656                                  * No page found! This can happen
3657                                  * if we are looking for a page
3658                                  * within a specific memory range
3659                                  * for DMA purposes. If PG_WAIT is
3660                                  * specified then we wait for a
3661                                  * while and then try again. The
3662                                  * wait could be forever if we
3663                                  * don't get the page(s) we need.
3664                                  *
3665                                  * Note: XXX We really need a mechanism
3666                                  * to wait for pages in the desired
3667                                  * range. For now, we wait for any
3668                                  * pages and see if we can use them.
3669                                  */
3670 
3671                                 if ((mattr != NULL) && (flags & PG_WAIT)) {
3672                                         delay(10);
3673                                         goto top;
3674                                 }
3675                                 goto fail; /* undo accounting stuff */
3676                         }
3677 
3678                         if (PP_ISAGED(npp) == 0) {
3679                                 /*
3680                                  * Since this page came from the
3681                                  * cachelist, we must destroy the
3682                                  * old vnode association.
3683                                  */
3684                                 page_hashout(npp, (kmutex_t *)NULL);
3685                         }
3686                 }
3687 
3688                 /*
3689                  * We own this page!
3690                  */
3691                 ASSERT(PAGE_EXCL(npp));
3692                 ASSERT(npp->p_vnode == NULL);
3693                 ASSERT(!hat_page_is_mapped(npp));
3694                 PP_CLRFREE(npp);
3695                 PP_CLRAGED(npp);
3696 
3697                 /*
3698                  * Here we have a page in our hot little mitts and are
3699                  * just waiting to stuff it on the appropriate lists.
3700                  * Get the mutex and check to see if it really does
3701                  * not exist.
3702                  */
3703                 phm = PAGE_HASH_MUTEX(index);
3704                 mutex_enter(phm);
3705                 PAGE_HASH_SEARCH(index, pp, vp, off);
3706                 if (pp == NULL) {
3707                         VM_STAT_ADD(page_create_new);
3708                         pp = npp;
3709                         npp = NULL;
3710                         if (!page_hashin(pp, vp, off, phm)) {
3711                                 /*
3712                                  * Since we hold the page hash mutex and
3713                                  * just searched for this page, page_hashin
3714                                  * had better not fail.  If it does, that
3715                                  * means some thread did not follow the
3716                                  * page hash mutex rules.  Panic now and
3717                                  * get it over with.  As usual, go down
3718                                  * holding all the locks.
3719                                  */
3720                                 ASSERT(MUTEX_HELD(phm));
3721                                 panic("page_create: hashin fail %p %p %llx %p",
3722                                     (void *)pp, (void *)vp, off, (void *)phm);
3723 
3724                         }
3725                         ASSERT(MUTEX_HELD(phm));
3726                         mutex_exit(phm);
3727                         phm = NULL;
3728 
3729                         /*
3730                          * Hat layer locking need not be done to set
3731                          * the following bits since the page is not hashed
3732                          * and was on the free list (i.e., had no mappings).
3733                          *
3734                          * Set the reference bit to protect
3735                          * against immediate pageout
3736                          *
3737                          * XXXmh modify freelist code to set reference
3738                          * bit so we don't have to do it here.
3739                          */
3740                         page_set_props(pp, P_REF);
3741                 } else {
3742                         ASSERT(MUTEX_HELD(phm));
3743                         mutex_exit(phm);
3744                         phm = NULL;
3745                         /*
3746                          * NOTE: This should not happen for pages associated
3747                          *       with kernel vnode 'kvp'.
3748                          */
3749                         /* XX64 - to debug why this happens! */
3750                         ASSERT(!VN_ISKAS(vp));
3751                         if (VN_ISKAS(vp))
3752                                 cmn_err(CE_NOTE,
3753                                     "page_create: page not expected "
3754                                     "in hash list for kernel vnode - pp 0x%p",
3755                                     (void *)pp);
3756                         VM_STAT_ADD(page_create_exists);
3757                         goto fail;
3758                 }
3759 
3760                 /*
3761                  * Got a page!  It is locked.  Acquire the i/o
3762                  * lock since we are going to use the p_next and
3763                  * p_prev fields to link the requested pages together.
3764                  */
3765                 page_io_lock(pp);
3766                 page_add(&plist, pp);
3767                 plist = plist->p_next;
3768                 off += MMU_PAGESIZE;
3769                 vaddr += MMU_PAGESIZE;
3770         }
3771 
3772         check_dma(mattr, plist, pages_req);
3773         return (plist);
3774 
3775 fail:
3776         if (npp != NULL) {
3777                 /*
3778                  * Did not need this page after all.
3779                  * Put it back on the free list.
3780                  */
3781                 VM_STAT_ADD(page_create_putbacks);
3782                 PP_SETFREE(npp);
3783                 PP_SETAGED(npp);
3784                 npp->p_offset = (u_offset_t)-1;
3785                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3786                 page_unlock(npp);
3787         }
3788 
3789         /*
3790          * Give up the pages we already got.
3791          */
3792         while (plist != NULL) {
3793                 pp = plist;
3794                 page_sub(&plist, pp);
3795                 page_io_unlock(pp);
3796                 plist_len++;
3797                 /*LINTED: constant in conditional ctx*/
3798                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3799         }
3800 
3801         /*
3802          * VN_DISPOSE does freemem accounting for the pages in plist
3803          * by calling page_free. So, we need to undo the pcf accounting
3804          * for only the remaining pages.
3805          */
3806         VM_STAT_ADD(page_create_putbacks);
3807         page_create_putback(pages_req - plist_len);
3808 
3809         return (NULL);
3810 }
3811 #endif /* !__xpv */
3812 
3813 
3814 /*
3815  * Copy the data from the physical page represented by "frompp" to
3816  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3817  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3818  * level and no one sleeps with an active mapping there.
3819  *
3820  * Note that the ref/mod bits in the page_t's are not affected by
3821  * this operation, hence it is up to the caller to update them appropriately.
3822  */
3823 int
3824 ppcopy(page_t *frompp, page_t *topp)
3825 {
3826         caddr_t         pp_addr1;
3827         caddr_t         pp_addr2;
3828         hat_mempte_t    pte1;
3829         hat_mempte_t    pte2;
3830         kmutex_t        *ppaddr_mutex;
3831         label_t         ljb;
3832         int             ret = 1;
3833 
3834         ASSERT_STACK_ALIGNED();
3835         ASSERT(PAGE_LOCKED(frompp));
3836         ASSERT(PAGE_LOCKED(topp));
3837 
3838         if (kpm_enable) {
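                     /*
                      * kpm provides permanent kernel mappings for every page,
                      * so no temporary mappings are needed.
                      */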
3839                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3840                 pp_addr2 = hat_kpm_page2va(topp, 0);
3841                 kpreempt_disable();
3842         } else {
3843                 /*
3844                  * disable pre-emption so that the CPU can't change under us
3845                  */
3846                 kpreempt_disable();
3847 
3848                 pp_addr1 = CPU->cpu_caddr1;
3849                 pp_addr2 = CPU->cpu_caddr2;
3850                 pte1 = CPU->cpu_caddr1pte;
3851                 pte2 = CPU->cpu_caddr2pte;
3852 
3853                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3854                 mutex_enter(ppaddr_mutex);
3855 
3856                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3857                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3858                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3859                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3860                     HAT_LOAD_NOCONSIST);
3861         }
3862 
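             /*
              * Guard the copy with on_fault() so that a fault during the
              * copy (e.g. an uncorrectable memory error) makes ppcopy()
              * return 0 instead of panicking.
              */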
3863         if (on_fault(&ljb)) {
3864                 ret = 0;
3865                 goto faulted;
3866         }
3867         if (use_sse_pagecopy)
3868 #ifdef __xpv
3869                 page_copy_no_xmm(pp_addr2, pp_addr1);
3870 #else
3871                 hwblkpagecopy(pp_addr1, pp_addr2);
3872 #endif
3873         else
3874                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3875 
3876         no_fault();
3877 faulted:
3878         if (!kpm_enable) {
3879 #ifdef __xpv
3880                 /*
3881                  * We can't leave unused mappings lying about under the
3882                  * hypervisor, so blow them away.
3883                  */
3884                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3885                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3886                         panic("HYPERVISOR_update_va_mapping() failed");
3887                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3888                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3889                         panic("HYPERVISOR_update_va_mapping() failed");
3890 #endif
3891                 mutex_exit(ppaddr_mutex);
3892         }
3893         kpreempt_enable();
3894         return (ret);
3895 }
3896 
3897 void
3898 pagezero(page_t *pp, uint_t off, uint_t len)
3899 {
3900         ASSERT(PAGE_LOCKED(pp));
3901         pfnzero(page_pptonum(pp), off, len);
3902 }
3903 
3904 /*
3905  * Zero the physical page from off to off + len given by pfn
3906  * without changing the reference and modified bits of the page.
3907  *
3908  * We do this using the CPU private page address #2; see ppcopy() for details.
3909  * pfnzero() must not be called at interrupt level.
3910  */
3911 void
3912 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3913 {
3914         caddr_t         pp_addr2;
3915         hat_mempte_t    pte2;
3916         kmutex_t        *ppaddr_mutex = NULL;
3917 
3918         ASSERT_STACK_ALIGNED();
3919         ASSERT(len <= MMU_PAGESIZE);
3920         ASSERT(off <= MMU_PAGESIZE);
3921         ASSERT(off + len <= MMU_PAGESIZE);
3922 
3923         if (kpm_enable && !pfn_is_foreign(pfn)) {
3924                 pp_addr2 = hat_kpm_pfn2va(pfn);
3925                 kpreempt_disable();
3926         } else {
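                     /*
                      * As in ppcopy(), disable preemption so that the CPU
                      * (and hence cpu_caddr2) can't change underneath us.
                      */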
3927                 kpreempt_disable();
3928 
3929                 pp_addr2 = CPU->cpu_caddr2;
3930                 pte2 = CPU->cpu_caddr2pte;
3931 
3932                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3933                 mutex_enter(ppaddr_mutex);
3934 
3935                 hat_mempte_remap(pfn, pp_addr2, pte2,
3936                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3937                     HAT_LOAD_NOCONSIST);
3938         }
3939 
3940         if (use_sse_pagezero) {
3941 #ifdef __xpv
3942                 uint_t rem;
3943 
3944                 /*
3945                  * zero a byte at a time until properly aligned for
3946                  * block_zero_no_xmm().
3947                  */
3948                 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3949                         pp_addr2[off++] = 0;
3950 
3951                 /*
3952                  * Now use faster block_zero_no_xmm() for any range
3953                  * that is properly aligned and sized.
3954                  */
3955                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3956                 len -= rem;
3957                 if (len != 0) {
3958                         block_zero_no_xmm(pp_addr2 + off, len);
3959                         off += len;
3960                 }
3961 
3962                 /*
3963                  * zero remainder with byte stores.
3964                  */
3965                 while (rem-- > 0)
3966                         pp_addr2[off++] = 0;
3967 #else
3968                 hwblkclr(pp_addr2 + off, len);
3969 #endif
3970         } else {
3971                 bzero(pp_addr2 + off, len);
3972         }
3973 
3974         if (!kpm_enable || pfn_is_foreign(pfn)) {
3975 #ifdef __xpv
3976                 /*
3977                  * On the hypervisor this page might get used for a page
3978                  * table before any intervening change to this mapping,
3979                  * so blow it away.
3980                  */
3981                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3982                     UVMF_INVLPG) < 0)
3983                         panic("HYPERVISOR_update_va_mapping() failed");
3984 #endif
3985                 mutex_exit(ppaddr_mutex);
3986         }
3987 
3988         kpreempt_enable();
3989 }
3990 
3991 /*
3992  * Platform-dependent page scrub call.
3993  */
3994 void
3995 pagescrub(page_t *pp, uint_t off, uint_t len)
3996 {
3997         /*
3998          * For now, we rely on the fact that pagezero() will
3999          * always clear UEs.
4000          */
4001         pagezero(pp, off, len);
4002 }
4003 
4004 /*
4005  * Set up two private addresses on a given CPU for use by ppcopy()
4006  */
4007 void
4008 setup_vaddr_for_ppcopy(struct cpu *cpup)
4009 {
4010         void *addr;
4011         hat_mempte_t pte_pa;
4012 
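             /*
              * cpu_caddr1 and cpu_caddr2 are single pages of kernel VA with
              * private PTEs that ppcopy() and pfnzero() remap onto arbitrary
              * pfns via hat_mempte_remap().
              */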
4013         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4014         pte_pa = hat_mempte_setup(addr);
4015         cpup->cpu_caddr1 = addr;
4016         cpup->cpu_caddr1pte = pte_pa;
4017 
4018         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4019         pte_pa = hat_mempte_setup(addr);
4020         cpup->cpu_caddr2 = addr;
4021         cpup->cpu_caddr2pte = pte_pa;
4022 
4023         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4024 }
4025 
4026 /*
4027  * Undo setup_vaddr_for_ppcopy
4028  */
4029 void
4030 teardown_vaddr_for_ppcopy(struct cpu *cpup)
4031 {
4032         mutex_destroy(&cpup->cpu_ppaddr_mutex);
4033 
4034         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4035         cpup->cpu_caddr2pte = 0;
4036         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4037         cpup->cpu_caddr2 = 0;
4038 
4039         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4040         cpup->cpu_caddr1pte = 0;
4041         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4042         cpup->cpu_caddr1 = 0;
4043 }
4044 
4045 /*
4046  * Function for flushing D-cache when performing module relocations
4047  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
4048  */
4049 void
4050 dcache_flushall()
4051 {}
4052 
4053 /*
4054  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4055  * number to vary where the pages come from.  This is quite a hacked up
4056  * method -- it works for now, but really needs to be fixed up a bit.
4057  *
4058  * We currently use page_create_va() on the kvp with fake offsets,
4059  * segments and virt address.  This is pretty bogus, but was copied from the
4060  * old hat_i86.c code.  A better approach would be to specify either mnode
4061  * random or mnode local and take a page from whatever color has the MOST
4062  * available - this would have a minimal impact on page coloring.
4063  */
4064 page_t *
4065 page_get_physical(uintptr_t seed)
4066 {
4067         page_t *pp;
4068         u_offset_t offset;
4069         static struct seg tmpseg;
4070         static uintptr_t ctr = 0;
4071 
4072         /*
4073          * This code is gross; we really need a simpler page allocator.
4074          *
4075          * We need to assign an offset for the page to call page_create_va().
4076          * To avoid conflicts with other pages, we get creative with the offset.
4077          * For 32 bits, we need an offset > 4Gig.
4078          * For 64 bits, we need an offset somewhere in the VA hole.
4079          */
4080         offset = seed;
4081         if (offset > kernelbase)
4082                 offset -= kernelbase;
4083         offset <<= MMU_PAGESHIFT;
4084 #if defined(__amd64)
4085         offset += mmu.hole_start;       /* something in VA hole */
4086 #else
4087         offset += 1ULL << 40;     /* something > 4 Gig */
4088 #endif
4089 
4090         if (page_resv(1, KM_NOSLEEP) == 0)
4091                 return (NULL);
4092 
4093 #ifdef  DEBUG
4094         pp = page_exists(&kvp, offset);
4095         if (pp != NULL)
4096                 panic("page already exists %p", (void *)pp);
4097 #endif
4098 
4099         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4100             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));       /* changing VA usage */
4101         if (pp != NULL) {
4102                 page_io_unlock(pp);
4103                 page_downgrade(pp);
4104         }
4105         return (pp);
4106 }