illumos-gate Old usr/src/uts/i86pc/vm/vm

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2018 Joyent, Inc.
  28  */
  29 
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32 
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37 
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>
  48 #include <sys/vmem.h>
  49 #include <sys/buf.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/lgrp.h>
  52 #include <sys/disp.h>
  53 #include <sys/vm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vnode.h>
  56 #include <sys/cred.h>
  57 #include <sys/exec.h>
  58 #include <sys/exechdr.h>
  59 #include <sys/debug.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/swap.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/random.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_kp.h>
  69 #include <vm/seg_vn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_kmem.h>
  72 #include <vm/seg_kpm.h>
  73 #include <vm/vm_dep.h>
  74 
  75 #include <sys/cpu.h>
  76 #include <sys/vm_machparam.h>
  77 #include <sys/memlist.h>
  78 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
  79 #include <vm/hat_i86.h>
  80 #include <sys/x86_archext.h>
  81 #include <sys/elf_386.h>
  82 #include <sys/cmn_err.h>
  83 #include <sys/archsystm.h>
  84 #include <sys/machsystm.h>
  85 #include <sys/secflags.h>
  86 
  87 #include <sys/vtrace.h>
  88 #include <sys/ddidmareq.h>
  89 #include <sys/promif.h>
  90 #include <sys/memnode.h>
  91 #include <sys/stack.h>
  92 #include <util/qsort.h>
  93 #include <sys/taskq.h>
  94 
  95 #ifdef __xpv
  96 
  97 #include <sys/hypervisor.h>
  98 #include <sys/xen_mmu.h>
  99 #include <sys/balloon_impl.h>
 100 
 101 /*
 102  * domain 0 pages usable for DMA are kept pre-allocated and kept in
 103  * distinct lists, ordered by increasing mfn.
 104  */
  105 static kmutex_t io_pool_lock;
  106 static kmutex_t contig_list_lock;
  107 static page_t *io_pool_4g;      /* pool for 32 bit dma limited devices */
  108 static page_t *io_pool_16m;     /* pool for 24 bit dma limited legacy devices */
  109 static long io_pool_cnt;        /* presumably pages now in the pool - confirm */
  110 static long io_pool_cnt_max = 0;
  111 #define DEFAULT_IO_POOL_MIN     128
  112 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
  113 static long io_pool_cnt_lowater = 0;
  114 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
  115 static long io_pool_shrinks;    /* how many times did we really shrink */
  116 static long io_pool_grows;      /* how many times did we grow */
  117 static mfn_t start_mfn = 1;
  118 static caddr_t io_pool_kva;     /* use to alloc pages when needed */
  119 
  120 static int create_contig_pfnlist(uint_t);       /* forward declaration */
  121 
  122 /*
  123  * percentage of phys mem to hold in the i/o pool
  124  */
  125 #define DEFAULT_IO_POOL_PCT     2
  126 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
  127 static void page_io_pool_sub(page_t **, page_t *, page_t *); /* forward decl */
      /* Unlike the rest of this group, ioalloc_dbg has external linkage. */
  128 int ioalloc_dbg = 0;
 129 
 130 #endif /* __xpv */
 131 
 132 uint_t vac_colors = 1;
 133 
 134 int largepagesupport = 0;
 135 extern uint_t page_create_new;
 136 extern uint_t page_create_exists;
 137 extern uint_t page_create_putbacks;
 138 /*
 139  * Allow users to disable the kernel's use of SSE.
 140  */
 141 extern int use_sse_pagecopy, use_sse_pagezero;
 142 
 143 /*
 144  * combined memory ranges from mnode and memranges[] to manage single
 145  * mnode/mtype dimension in the page lists.
 146  */
  147 typedef struct {
  148         pfn_t   mnr_pfnlo;              /* presumably lowest pfn - confirm */
  149         pfn_t   mnr_pfnhi;              /* presumably highest pfn - confirm */
  150         int     mnr_mnode;              /* presumably owning mnode - confirm */
  151         int     mnr_memrange;           /* index into memranges[] */
  152         int     mnr_next;               /* next lower PA mnoderange */
  153         int     mnr_exists;
  154         /* maintain page list stats */
  155         pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
  156         pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
  157         pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
                                              /* (read through MTYPE_FREEMEM()) */
  158 #ifdef DEBUG
  159         struct mnr_mts {                /* mnode/mtype szc stats */
  160                 pgcnt_t mnr_mts_pgcnt;
  161                 int     mnr_mts_colors;
  162                 pgcnt_t *mnr_mtsc_pgcnt;
  163         }       *mnr_mts;               /* only present in DEBUG builds */
  164 #endif
  165 } mnoderange_t;
 166 
 167 #define MEMRANGEHI(mtype)                                               \
 168         ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169 #define MEMRANGELO(mtype)       (memranges[mtype])
 170 
 171 #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172 
 173 /*
  174  * As the PC architecture evolved, memory was clumped into several
 175  * ranges for various historical I/O devices to do DMA.
 176  * < 16Meg - ISA bus
 177  * < 2Gig - ???
 178  * < 4Gig - PCI bus or drivers that don't understand PAE mode
 179  *
 180  * These are listed in reverse order, so that we can skip over unused
 181  * ranges on machines with small memories.
 182  *
 183  * For now under the Hypervisor, we'll only ever have one memrange.
 184  */
  185 #define PFN_4GIG        0x100000        /* 4G as a pfn, assuming 4K pages */
  186 #define PFN_16MEG       0x1000          /* 16M as a pfn, assuming 4K pages */
  187 /* Indices into the memory range (arch_memranges) array. */
  188 #define MRI_4G          0
  189 #define MRI_2G          1
  190 #define MRI_16M         2
  191 #define MRI_0           3
      /*
       * Each entry is the first pfn of its range.  Entries are in descending
       * order, in the same order as the MRI_* indices above.
       */
  192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
  193     PFN_4GIG,   /* pfn range for 4G and above */
  194     0x80000,    /* pfn range for 2G-4G */
  195     PFN_16MEG,  /* pfn range for 16M-2G */
  196     0x00000,    /* pfn range for 0-16M */
  197 };
  198 pfn_t *memranges = &arch_memranges[0];  /* what MEMRANGEHI/LO index */
  199 int nranges = NUM_MEM_RANGES;
 200 
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
  205 mnoderange_t    *mnoderanges;   /* indexed by mtype, see MTYPE_FREEMEM() */
  206 int             mnoderangecnt;
  207 int             mtype4g;
  208 int             mtype16m;       /* mnoderanges[] index read by FREEMEM16M */
  209 int             mtypetop;       /* index of highest pfn'ed mnoderange */
 210 
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 222  * maxmem4g is the count of the maximum number of pages on the page lists
  223  * with physical addresses below 4g. It can be a lot less than 4g given that
 224  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 225  * agp aperture etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *
 230  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
  231  * 1/16, about 6% (desfree4gshift = 4), of maxmem4g.
 232  *
 233  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 234  * and the amount of physical memory above 4g is greater than freemem4g.
 235  * In this case, page_get_* routines will restrict below 4g allocations
 236  * for requests that don't specifically require it.
 237  */
 238 
  239 #define DESFREE4G       (maxmem4g >> desfree4gshift)
  240 
      /*
       * All three conditions must hold: the machine has memory above 4g,
       * free memory below 4g is under DESFREE4G, and twice freemem4g is
       * still less than freemem.  Assuming freemem counts all free pages,
       * the last test means more pages are free above 4g than below.
       */
  241 #define RESTRICT4G_ALLOC                                        \
  242         (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
  243 
  244 static pgcnt_t  maxmem4g;
  245 static pgcnt_t  freemem4g;
  246 static int      physmax4g;
  247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 248 
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
  254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
  255  * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262 
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 266         ((freemem != 0) && ((flags & PG_PANIC) == 0) &&             \
 267             ((freemem >= (FREEMEM16M)) ||                    \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269 
  270 static pgcnt_t  desfree16m = 0x380;     /* what DESFREE16M expands to */
  271 
  272 /*
  273  * This can be patched via /etc/system to allow old non-PAE aware device
  274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
  275  */
  276 int restricted_kmemalloc = 0;
 277 
  278 #ifdef VM_STATS
      /*
       * Counters that exist only when VM_STATS is defined.
       * NOTE(review): judging by the names, pga_* and pgma_* count attempts,
       * successes and failures of two allocation paths - confirm against the
       * code that increments them.
       */
  279 struct {
  280         ulong_t pga_alloc;
  281         ulong_t pga_notfullrange;
  282         ulong_t pga_nulldmaattr;
  283         ulong_t pga_allocok;
  284         ulong_t pga_allocfailed;
  285         ulong_t pgma_alloc;
  286         ulong_t pgma_allocok;
  287         ulong_t pgma_allocfailed;
  288         ulong_t pgma_allocempty;
  289 } pga_vmstats;
  290 #endif
 291 
  292 uint_t mmu_page_sizes;
  293 
  294 /* How many page sizes the users can see */
      /* map_szcvec() tries size codes from this value minus one down to 1. */
  295 uint_t mmu_exported_page_sizes;
  296 
  297 /* page sizes that legacy applications can see */
  298 uint_t mmu_legacy_page_sizes;
  299 
  300 /*
  301  * Number of pages in 1 GB.  Don't enable automatic large pages if we have
  302  * fewer than this many pages.
  303  */
      /*
       * Both are compared against physmem: map_pgszcvec() hands one of them
       * to map_szcvec() as its threshold, and map_pgsz() checks
       * privm_lpg_min_physmem directly for every non-ISM mapping type.
       */
  304 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
  305 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
 306 
 307 /*
 308  * Maximum and default segment size tunables for user private
 309  * and shared anon memory, and user text and initialized data.
 310  * These can be patched via /etc/system to allow large pages
 311  * to be used for mapping application private and shared anon memory.
 312  */
      /*
       * With a cap left at MMU_PAGESIZE, map_pgsz() returns MMU_PAGESIZE and
       * map_szcvec() returns 0, i.e. no large pages for that mapping type.
       */
  313 size_t mcntl0_lpsize = MMU_PAGESIZE;    /* cap used when memcntl is set */
  314 size_t max_uheap_lpsize = MMU_PAGESIZE; /* heap cap */
  315 size_t default_uheap_lpsize = MMU_PAGESIZE; /* min len map_pgsz uses, heap */
  316 size_t max_ustack_lpsize = MMU_PAGESIZE; /* stack cap */
  317 size_t default_ustack_lpsize = MMU_PAGESIZE; /* min len map_pgsz uses, stack */
  318 size_t max_privmap_lpsize = MMU_PAGESIZE; /* cap for all other mappings */
  319 size_t max_uidata_lpsize = MMU_PAGESIZE; /* MAP_INITDATA cap */
  320 size_t max_utext_lpsize = MMU_PAGESIZE; /* MAP_TEXT cap */
  321 size_t max_shm_lpsize = MMU_PAGESIZE;   /* MAPPGSZC_SHM cap */
 322 
 323 
 324 /*
 325  * initialized by page_coloring_init().
 326  */
 327 uint_t  page_colors;
 328 uint_t  page_colors_mask;
 329 uint_t  page_coloring_shift;
 330 int     cpu_page_colors;
 331 static uint_t   l2_colors;
 332 
 333 /*
 334  * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 335  * and page_colors are calculated from the l2 cache n-way set size.  Within a
 336  * mnode range, the page freelist and cachelist are hashed into bins based on
 337  * color. This makes it easier to search for a page within a specific memory
 338  * range.
 339  */
 340 #define PAGE_COLORS_MIN 16
 341 
 342 page_t ****page_freelists;
 343 page_t ***page_cachelists;
 344 
 345 
 346 /*
 347  * Used by page layer to know about page sizes
 348  */
  349 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; /* indexed by page size code */
  350 
  351 kmutex_t        *fpc_mutex[NPC_MUTEX];
  352 kmutex_t        *cpc_mutex[NPC_MUTEX];
  353 
  354 /* Lock to protect mnoderanges array for memory DR operations. */
  355 static kmutex_t mnoderange_lock;
  356 
  357 /*
  358  * Only let one thread at a time try to coalesce large pages, to
  359  * prevent them from working against each other.
  360  */
  361 static kmutex_t contig_lock;
      /*
       * NOTE(review): both expansions already end in ';'.  Writing
       * "CONTIG_LOCK();" therefore produces an extra empty statement, which
       * breaks an unbraced "if (x) CONTIG_LOCK(); else ...".  Left unchanged
       * here because the callers are not visible in this part of the file.
       */
  362 #define CONTIG_LOCK()   mutex_enter(&contig_lock);
  363 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
  364 
      /* 0x1000000 bytes (16MB) passed through mmu_btop(). */
  365 #define PFN_16M         (mmu_btop((uint64_t)0x1000000))
 366 
 367 caddr_t
 368 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
 369 {
 370         caddr_t addr;
 371         caddr_t addr1;
 372         page_t *pp;
 373 
 374         addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
 375 
 376         for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
 377                 pp = page_numtopp_nolock(pf);
 378                 if (pp == NULL) {
 379                         hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
 380                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 381                 } else {
 382                         hat_memload(kas.a_hat, addr, pp,
 383                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
 384                 }
 385         }
 386 
 387         return (addr1);
 388 }
 389 
 390 /*
 391  * This routine is like page_numtopp, but accepts only free pages, which
 392  * it allocates (unfrees) and returns with the exclusive lock held.
 393  * It is used by machdep.c/dma_init() to find contiguous free pages.
 394  */
 395 page_t *
 396 page_numtopp_alloc(pfn_t pfnum)
 397 {
 398         page_t *pp;
 399 
 400 retry:
 401         pp = page_numtopp_nolock(pfnum);
 402         if (pp == NULL) {
 403                 return (NULL);
 404         }
 405 
 406         if (!page_trylock(pp, SE_EXCL)) {
 407                 return (NULL);
 408         }
 409 
 410         if (page_pptonum(pp) != pfnum) {
 411                 page_unlock(pp);
 412                 goto retry;
 413         }
 414 
 415         if (!PP_ISFREE(pp)) {
 416                 page_unlock(pp);
 417                 return (NULL);
 418         }
 419         if (pp->p_szc) {
 420                 page_demote_free_pages(pp);
 421                 page_unlock(pp);
 422                 goto retry;
 423         }
 424 
 425         /* If associated with a vnode, destroy mappings */
 426 
 427         if (pp->p_vnode) {
 428 
 429                 page_destroy_free(pp);
 430 
 431                 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 432                         return (NULL);
 433                 }
 434 
 435                 if (page_pptonum(pp) != pfnum) {
 436                         page_unlock(pp);
 437                         goto retry;
 438                 }
 439         }
 440 
 441         if (!PP_ISFREE(pp)) {
 442                 page_unlock(pp);
 443                 return (NULL);
 444         }
 445 
 446         if (!page_reclaim(pp, (kmutex_t *)NULL))
 447                 return (NULL);
 448 
 449         return (pp);
 450 }
 451 
 452 /*
 453  * Return the optimum page size for a given mapping
 454  */
 455 /*ARGSUSED*/
 456 size_t
 457 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 458 {
 459         level_t l = 0;
 460         size_t pgsz = MMU_PAGESIZE;
 461         size_t max_lpsize;
 462         uint_t mszc;
 463 
 464         ASSERT(maptype != MAPPGSZ_VA);
 465 
 466         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 467                 return (MMU_PAGESIZE);
 468         }
 469 
 470         switch (maptype) {
 471         case MAPPGSZ_HEAP:
 472         case MAPPGSZ_STK:
 473                 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
 474                     MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
 475                 if (max_lpsize == MMU_PAGESIZE) {
 476                         return (MMU_PAGESIZE);
 477                 }
 478                 if (len == 0) {
 479                         len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
 480                             p->p_brksize - p->p_bssbase : p->p_stksize;
 481                 }
 482                 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
 483                     default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
 484 
 485                 /*
 486                  * use the pages size that best fits len
 487                  */
 488                 for (l = mmu.umax_page_level; l > 0; --l) {
 489                         if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
 490                                 continue;
 491                         } else {
 492                                 pgsz = LEVEL_SIZE(l);
 493                         }
 494                         break;
 495                 }
 496 
 497                 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
 498                     p->p_stkpageszc);
 499                 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
 500                         pgsz = hw_page_array[mszc].hp_size;
 501                 }
 502                 return (pgsz);
 503 
 504         case MAPPGSZ_ISM:
 505                 for (l = mmu.umax_page_level; l > 0; --l) {
 506                         if (len >= LEVEL_SIZE(l))
 507                                 return (LEVEL_SIZE(l));
 508                 }
 509                 return (LEVEL_SIZE(0));
 510         }
 511         return (pgsz);
 512 }
 513 
 514 static uint_t
 515 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
 516     size_t min_physmem)
 517 {
 518         caddr_t eaddr = addr + size;
 519         uint_t szcvec = 0;
 520         caddr_t raddr;
 521         caddr_t readdr;
 522         size_t  pgsz;
 523         int i;
 524 
 525         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 526                 return (0);
 527         }
 528 
 529         for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
 530                 pgsz = page_get_pagesize(i);
 531                 if (pgsz > max_lpsize) {
 532                         continue;
 533                 }
 534                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 535                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 536                 if (raddr < addr || raddr >= readdr) {
 537                         continue;
 538                 }
 539                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 540                         continue;
 541                 }
 542                 /*
 543                  * Set szcvec to the remaining page sizes.
 544                  */
 545                 szcvec = ((1 << (i + 1)) - 1) & ~1;
 546                 break;
 547         }
 548         return (szcvec);
 549 }
 550 
 551 /*
 552  * Return a bit vector of large page size codes that
 553  * can be used to map [addr, addr + len) region.
 554  */
 555 /*ARGSUSED*/
 556 uint_t
 557 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 558     int memcntl)
 559 {
 560         size_t max_lpsize = mcntl0_lpsize;
 561 
 562         if (mmu.max_page_level == 0)
 563                 return (0);
 564 
 565         if (flags & MAP_TEXT) {
 566                 if (!memcntl)
 567                         max_lpsize = max_utext_lpsize;
 568                 return (map_szcvec(addr, size, off, max_lpsize,
 569                     shm_lpg_min_physmem));
 570 
 571         } else if (flags & MAP_INITDATA) {
 572                 if (!memcntl)
 573                         max_lpsize = max_uidata_lpsize;
 574                 return (map_szcvec(addr, size, off, max_lpsize,
 575                     privm_lpg_min_physmem));
 576 
 577         } else if (type == MAPPGSZC_SHM) {
 578                 if (!memcntl)
 579                         max_lpsize = max_shm_lpsize;
 580                 return (map_szcvec(addr, size, off, max_lpsize,
 581                     shm_lpg_min_physmem));
 582 
 583         } else if (type == MAPPGSZC_HEAP) {
 584                 if (!memcntl)
 585                         max_lpsize = max_uheap_lpsize;
 586                 return (map_szcvec(addr, size, off, max_lpsize,
 587                     privm_lpg_min_physmem));
 588 
 589         } else if (type == MAPPGSZC_STACK) {
 590                 if (!memcntl)
 591                         max_lpsize = max_ustack_lpsize;
 592                 return (map_szcvec(addr, size, off, max_lpsize,
 593                     privm_lpg_min_physmem));
 594 
 595         } else {
 596                 if (!memcntl)
 597                         max_lpsize = max_privmap_lpsize;
 598                 return (map_szcvec(addr, size, off, max_lpsize,
 599                     privm_lpg_min_physmem));
 600         }
 601 }
 602 
 603 /*
 604  * Handle a pagefault.
 605  */
 606 faultcode_t
 607 pagefault(
 608         caddr_t addr,
 609         enum fault_type type,
 610         enum seg_rw rw,
 611         int iskernel)
 612 {
 613         struct as *as;
 614         struct hat *hat;
 615         struct proc *p;
 616         kthread_t *t;
 617         faultcode_t res;
 618         caddr_t base;
 619         size_t len;
 620         int err;
 621         int mapped_red;
 622         uintptr_t ea;
 623 
 624         ASSERT_STACK_ALIGNED();
 625 
 626         if (INVALID_VADDR(addr))
 627                 return (FC_NOMAP);
 628 
 629         mapped_red = segkp_map_red();
 630 
 631         if (iskernel) {
 632                 as = &kas;
 633                 hat = as->a_hat;
 634         } else {
 635                 t = curthread;
 636                 p = ttoproc(t);
 637                 as = p->p_as;
 638                 hat = as->a_hat;
 639         }
 640 
 641         /*
 642          * Dispatch pagefault.
 643          */
 644         res = as_fault(hat, as, addr, 1, type, rw);
 645 
 646         /*
 647          * If this isn't a potential unmapped hole in the user's
 648          * UNIX data or stack segments, just return status info.
 649          */
 650         if (res != FC_NOMAP || iskernel)
 651                 goto out;
 652 
 653         /*
 654          * Check to see if we happened to faulted on a currently unmapped
 655          * part of the UNIX data or stack segments.  If so, create a zfod
 656          * mapping there and then try calling the fault routine again.
 657          */
 658         base = p->p_brkbase;
 659         len = p->p_brksize;
 660 
 661         if (addr < base || addr >= base + len) {          /* data seg? */
 662                 base = (caddr_t)p->p_usrstack - p->p_stksize;
 663                 len = p->p_stksize;
 664                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 665                         /* not in either UNIX data or stack segments */
 666                         res = FC_NOMAP;
 667                         goto out;
 668                 }
 669         }
 670 
 671         /*
 672          * the rest of this function implements a 3.X 4.X 5.X compatibility
 673          * This code is probably not needed anymore
 674          */
 675         if (p->p_model == DATAMODEL_ILP32) {
 676 
 677                 /* expand the gap to the page boundaries on each side */
 678                 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
 679                 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
 680                 len = ea - (uintptr_t)base;
 681 
 682                 as_rangelock(as);
 683                 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
 684                     0) {
 685                         err = as_map(as, base, len, segvn_create, zfod_argsp);
 686                         as_rangeunlock(as);
 687                         if (err) {
 688                                 res = FC_MAKE_ERR(err);
 689                                 goto out;
 690                         }
 691                 } else {
 692                         /*
 693                          * This page is already mapped by another thread after
 694                          * we returned from as_fault() above.  We just fall
 695                          * through as_fault() below.
 696                          */
 697                         as_rangeunlock(as);
 698                 }
 699 
 700                 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
 701         }
 702 
 703 out:
 704         if (mapped_red)
 705                 segkp_unmap_red();
 706 
 707         return (res);
 708 }
 709 
 710 void
 711 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 712 {
 713         struct proc *p = curproc;
 714         caddr_t userlimit = (flags & _MAP_LOW32) ?
 715             (caddr_t)_userlimit32 : p->p_as->a_userlimit;
 716 
 717         map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
 718 }
 719 
 720 /*ARGSUSED*/
  721 int
  722 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
  723 {
              /* Both arguments are ignored; the result is always 0. */
  724         return (0);
  725 }
 726 
 727 /*
 728  * The maximum amount a randomized mapping will be slewed.  We should perhaps
 729  * arrange things so these tunables can be separate for mmap, mmapobj, and
 730  * ld.so
 731  */
 732 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
 733 
 734 /*
 735  * map_addr_proc() is the routine called when the system is to
 736  * choose an address for the user.  We will pick an address
 737  * range which is the highest available below userlimit.
 738  *
 739  * Every mapping will have a redzone of a single page on either side of
 740  * the request. This is done to leave one page unmapped between segments.
 741  * This is not required, but it's useful for the user because if their
 742  * program strays across a segment boundary, it will catch a fault
 743  * immediately making debugging a little easier.  Currently the redzone
 744  * is mandatory.
 745  *
 746  * addrp is a value/result parameter.
 747  *      On input it is a hint from the user to be used in a completely
 748  *      machine dependent fashion.  We decide to completely ignore this hint.
 749  *      If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 750  *      must be some "power of two" multiple of pagesize.
 751  *
 752  *      On output it is NULL if no address can be found in the current
 753  *      processes address space or else an address that is currently
 754  *      not mapped for len bytes with a page of red zone on either side.
 755  *
  756  *      vacalign is not needed on x86 (it's for virtually addressed caches)
 757  */
 758 /*ARGSUSED*/
 759 void
 760 map_addr_proc(
 761         caddr_t *addrp,
 762         size_t len,
 763         offset_t off,
 764         int vacalign,
 765         caddr_t userlimit,
 766         struct proc *p,
 767         uint_t flags)
 768 {
 769         struct as *as = p->p_as;
 770         caddr_t addr;
 771         caddr_t base;
 772         size_t slen;
 773         size_t align_amount;
 774 
 775         ASSERT32(userlimit == as->a_userlimit);
 776 
 777         base = p->p_brkbase;
 778 #if defined(__amd64)
 779         if (p->p_model == DATAMODEL_NATIVE) {
 780                 if (userlimit < as->a_userlimit) {
 781                         /*
 782                          * This happens when a program wants to map
 783                          * something in a range that's accessible to a
 784                          * program in a smaller address space.  For example,
 785                          * a 64-bit program calling mmap32(2) to guarantee
 786                          * that the returned address is below 4Gbytes.
 787                          */
 788                         ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
 789 
 790                         if (userlimit > base)
 791                                 slen = userlimit - base;
 792                         else {
 793                                 *addrp = NULL;
 794                                 return;
 795                         }
 796                 } else {
 797                         /*
 798                          * With the stack positioned at a higher address than
 799                          * the heap for 64-bit processes, it is necessary to be
 800                          * mindful of its location and potential size.
 801                          *
 802                          * Unallocated space above the top of the stack (that
 803                          * is, at a lower address) but still within the bounds
 804                          * of the stack limit should be considered unavailable.
 805                          *
 806                          * As the 64-bit stack guard is mapped in immediately
 807                          * adjacent to the stack limit boundary, this prevents
 808                          * new mappings from having accidentally dangerous
 809                          * proximity to the stack.
 810                          */
 811                         slen = p->p_usrstack - base -
 812                             ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 813                 }
 814         } else
 815 #endif /* defined(__amd64) */
 816                 slen = userlimit - base;
 817 
 818         /* Make len be a multiple of PAGESIZE */
 819         len = (len + PAGEOFFSET) & PAGEMASK;
 820 
 821         /*
 822          * figure out what the alignment should be
 823          *
 824          * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
 825          */
 826         if (len <= ELF_386_MAXPGSZ) {
 827                 /*
 828                  * Align virtual addresses to ensure that ELF shared libraries
 829                  * are mapped with the appropriate alignment constraints by
 830                  * the run-time linker.
 831                  */
 832                 align_amount = ELF_386_MAXPGSZ;
 833         } else {
 834                 /*
 835                  * For 32-bit processes, only those which have specified
 836                  * MAP_ALIGN and an addr will be aligned on a larger page size.
 837                  * Not doing so can potentially waste up to 1G of process
 838                  * address space.
 839                  */
 840                 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
 841                     mmu.umax_page_level;
 842 
 843                 while (lvl && len < LEVEL_SIZE(lvl))
 844                         --lvl;
 845 
 846                 align_amount = LEVEL_SIZE(lvl);
 847         }
 848         if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
 849                 align_amount = (uintptr_t)*addrp;
 850 
 851         ASSERT(ISP2(align_amount));
 852         ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
 853 
 854         off = off & (align_amount - 1);
 855 
 856         /*
 857          * Look for a large enough hole starting below userlimit.
 858          * After finding it, use the upper part.
 859          */
 860         if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
 861             PAGESIZE, off) == 0) {
 862                 caddr_t as_addr;
 863 
 864                 /*
 865                  * addr is the highest possible address to use since we have
 866                  * a PAGESIZE redzone at the beginning and end.
 867                  */
 868                 addr = base + slen - (PAGESIZE + len);
 869                 as_addr = addr;
 870                 /*
 871                  * Round address DOWN to the alignment amount and
 872                  * add the offset in.
 873                  * If addr is greater than as_addr, len would not be large
 874                  * enough to include the redzone, so we must adjust down
 875                  * by the alignment amount.
 876                  */
 877                 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
 878                 addr += (uintptr_t)off;
 879                 if (addr > as_addr) {
 880                         addr -= align_amount;
 881                 }
 882 
 883                 /*
 884                  * If randomization is requested, slew the allocation
 885                  * backwards, within the same gap, by a random amount.
 886                  */
 887                 if (flags & _MAP_RANDOMIZE) {
 888                         uint32_t slew;
 889 
 890                         (void) random_get_pseudo_bytes((uint8_t *)&slew,
 891                             sizeof (slew));
 892 
 893                         slew = slew % MIN(aslr_max_map_skew, (addr - base));
 894                         addr -= P2ALIGN(slew, align_amount);
 895                 }
 896 
 897                 ASSERT(addr > base);
 898                 ASSERT(addr + len < base + slen);
 899                 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
 900                     ((uintptr_t)(off)));
 901                 *addrp = addr;
 902         } else {
 903                 *addrp = NULL;  /* no more virtual space */
 904         }
 905 }
 906 
 907 int valid_va_range_aligned_wraparound;
 908 
 909 /*
 910  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 911  * addresses at least "minlen" long, where the base of the range is at "off"
 912  * phase from an "align" boundary and there is space for a "redzone"-sized
 913  * redzone on either side of the range.  On success, 1 is returned and *basep
 914  * and *lenp are adjusted to describe the acceptable range (including
 915  * the redzone).  On failure, 0 is returned.
 916  */
 917 /*ARGSUSED3*/
 918 int
 919 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 920     size_t align, size_t redzone, size_t off)
 921 {
 922         uintptr_t hi, lo;
 923         size_t tot_len;
 924 
 925         ASSERT(align == 0 ? off == 0 : off < align);
 926         ASSERT(ISP2(align));
 927         ASSERT(align == 0 || align >= PAGESIZE);
 928 
 929         lo = (uintptr_t)*basep;
 930         hi = lo + *lenp;
 931         tot_len = minlen + 2 * redzone; /* need at least this much space */
 932 
 933         /*
 934          * If hi rolled over the top, try cutting back.
 935          */
 936         if (hi < lo) {
 937                 *lenp = 0UL - lo - 1UL;
 938                 /* See if this really happens. If so, then we figure out why */
 939                 valid_va_range_aligned_wraparound++;
 940                 hi = lo + *lenp;
 941         }
 942         if (*lenp < tot_len) {
 943                 return (0);
 944         }
 945 
 946 #if defined(__amd64)
 947         /*
 948          * Deal with a possible hole in the address range between
 949          * hole_start and hole_end that should never be mapped.
 950          */
 951         if (lo < hole_start) {
 952                 if (hi > hole_start) {
 953                         if (hi < hole_end) {
 954                                 hi = hole_start;
 955                         } else {
 956                                 /* lo < hole_start && hi >= hole_end */
 957                                 if (dir == AH_LO) {
 958                                         /*
 959                                          * prefer lowest range
 960                                          */
 961                                         if (hole_start - lo >= tot_len)
 962                                                 hi = hole_start;
 963                                         else if (hi - hole_end >= tot_len)
 964                                                 lo = hole_end;
 965                                         else
 966                                                 return (0);
 967                                 } else {
 968                                         /*
 969                                          * prefer highest range
 970                                          */
 971                                         if (hi - hole_end >= tot_len)
 972                                                 lo = hole_end;
 973                                         else if (hole_start - lo >= tot_len)
 974                                                 hi = hole_start;
 975                                         else
 976                                                 return (0);
 977                                 }
 978                         }
 979                 }
 980         } else {
 981                 /* lo >= hole_start */
 982                 if (hi < hole_end)
 983                         return (0);
 984                 if (lo < hole_end)
 985                         lo = hole_end;
 986         }
 987 #endif
 988 
 989         if (hi - lo < tot_len)
 990                 return (0);
 991 
 992         if (align > 1) {
 993                 uintptr_t tlo = lo + redzone;
 994                 uintptr_t thi = hi - redzone;
 995                 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
 996                 if (tlo < lo + redzone) {
 997                         return (0);
 998                 }
 999                 if (thi < tlo || thi - tlo < minlen) {
1000                         return (0);
1001                 }
1002         }
1003 
1004         *basep = (caddr_t)lo;
1005         *lenp = hi - lo;
1006         return (1);
1007 }
1008 
1009 /*
1010  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
1011  * addresses at least "minlen" long.  On success, 1 is returned and *basep
1012  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
1013  * is returned.
1014  */
1015 int
1016 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
1017 {
1018         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
1019 }
1020 
1021 /*
1022  * Default to forbidding the first 64k of address space.  This protects most
1023  * reasonably sized structures from dereferences through NULL:
1024  *     ((foo_t *)0)->bar
1025  */
1026 uintptr_t forbidden_null_mapping_sz = 0x10000;
1027 
1028 /*
1029  * Determine whether [addr, addr+len] are valid user addresses.
1030  */
1031 /*ARGSUSED*/
1032 int
1033 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
1034     caddr_t userlimit)
1035 {
1036         caddr_t eaddr = addr + len;
1037 
1038         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
1039                 return (RANGE_BADADDR);
1040 
1041         if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
1042             as->a_proc != NULL &&
1043             secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
1044                 return (RANGE_BADADDR);
1045 
1046 #if defined(__amd64)
1047         /*
1048          * Check for the VA hole
1049          */
1050         if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
1051                 return (RANGE_BADADDR);
1052 #endif
1053 
1054         return (RANGE_OKAY);
1055 }
1056 
1057 /*
1058  * Return 1 if the page frame is onboard memory, else 0.
1059  */
1060 int
1061 pf_is_memory(pfn_t pf)
1062 {
1063         if (pfn_is_foreign(pf))
1064                 return (0);
1065         return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
1066 }
1067 
1068 /*
1069  * return the memrange containing pfn
1070  */
1071 int
1072 memrange_num(pfn_t pfn)
1073 {
1074         int n;
1075 
1076         for (n = 0; n < nranges - 1; ++n) {
1077                 if (pfn >= memranges[n])
1078                         break;
1079         }
1080         return (n);
1081 }
1082 
1083 /*
1084  * return the mnoderange containing pfn
1085  */
1086 /*ARGSUSED*/
1087 int
1088 pfn_2_mtype(pfn_t pfn)
1089 {
1090 #if defined(__xpv)
1091         return (0);
1092 #else
1093         int     n;
1094 
1095         /* Always start from highest pfn and work our way down */
1096         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1097                 if (pfn >= mnoderanges[n].mnr_pfnlo) {
1098                         break;
1099                 }
1100         }
1101         return (n);
1102 #endif
1103 }
1104 
1105 #if !defined(__xpv)
1106 /*
1107  * is_contigpage_free:
1108  *      returns a page list of contiguous pages. It minimally has to return
1109  *      minctg pages. Caller determines minctg based on the scatter-gather
1110  *      list length.
1111  *
1112  *      pfnp is set to the next page frame to search on return.
1113  */
1114 static page_t *
1115 is_contigpage_free(
1116         pfn_t *pfnp,
1117         pgcnt_t *pgcnt,
1118         pgcnt_t minctg,
1119         uint64_t pfnseg,
1120         int iolock)
1121 {
1122         int     i = 0;
1123         pfn_t   pfn = *pfnp;
1124         page_t  *pp;
1125         page_t  *plist = NULL;
1126 
1127         /*
1128          * fail if pfn + minctg crosses a segment boundary.
1129          * Adjust for next starting pfn to begin at segment boundary.
1130          */
1131 
1132         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1133                 *pfnp = roundup(*pfnp, pfnseg + 1);
1134                 return (NULL);
1135         }
1136 
1137         do {
1138 retry:
1139                 pp = page_numtopp_nolock(pfn + i);
1140                 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1141                     (page_trylock(pp, SE_EXCL) == 0)) {
1142                         (*pfnp)++;
1143                         break;
1144                 }
1145                 if (page_pptonum(pp) != pfn + i) {
1146                         page_unlock(pp);
1147                         goto retry;
1148                 }
1149 
1150                 if (!(PP_ISFREE(pp))) {
1151                         page_unlock(pp);
1152                         (*pfnp)++;
1153                         break;
1154                 }
1155 
1156                 if (!PP_ISAGED(pp)) {
1157                         page_list_sub(pp, PG_CACHE_LIST);
1158                         page_hashout(pp, (kmutex_t *)NULL);
1159                 } else {
1160                         page_list_sub(pp, PG_FREE_LIST);
1161                 }
1162 
1163                 if (iolock)
1164                         page_io_lock(pp);
1165                 page_list_concat(&plist, &pp);
1166 
1167                 /*
1168                  * exit loop when pgcnt satisfied or segment boundary reached.
1169                  */
1170 
1171         } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1172 
1173         *pfnp += i;             /* set to next pfn to search */
1174 
1175         if (i >= minctg) {
1176                 *pgcnt -= i;
1177                 return (plist);
1178         }
1179 
1180         /*
1181          * failure: minctg not satisfied.
1182          *
1183          * if next request crosses segment boundary, set next pfn
1184          * to search from the segment boundary.
1185          */
1186         if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1187                 *pfnp = roundup(*pfnp, pfnseg + 1);
1188 
1189         /* clean up any pages already allocated */
1190 
1191         while (plist) {
1192                 pp = plist;
1193                 page_sub(&plist, pp);
1194                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1195                 if (iolock)
1196                         page_io_unlock(pp);
1197                 page_unlock(pp);
1198         }
1199 
1200         return (NULL);
1201 }
1202 #endif  /* !__xpv */
1203 
/*
 * Sanity check that pages handed out by the allocator honor the DMA
 * attributes.  Without DEBUG this compiles to nothing.  With DEBUG, the
 * first "cnt" pages reached from "pp" via p_next are checked and the system
 * panics if a page's machine address is below dma_attr_addr_lo or at/above
 * dma_attr_addr_hi.  A NULL dma_attr disables the check.
 */
#ifndef DEBUG
#define check_dma(a, b, c) (void)(0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	for (; cnt > 0; cnt--, pp = pp->p_next) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo) {
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		}
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi) {
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		}
	}
}
#endif
1227 
1228 #if !defined(__xpv)
1229 static page_t *
1230 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1231 {
1232         pfn_t           pfn;
1233         int             sgllen;
1234         uint64_t        pfnseg;
1235         pgcnt_t         minctg;
1236         page_t          *pplist = NULL, *plist;
1237         uint64_t        lo, hi;
1238         pgcnt_t         pfnalign = 0;
1239         static pfn_t    startpfn;
1240         static pgcnt_t  lastctgcnt;
1241         uintptr_t       align;
1242 
1243         CONTIG_LOCK();
1244 
1245         if (mattr) {
1246                 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1247                 hi = mmu_btop(mattr->dma_attr_addr_hi);
1248                 if (hi >= physmax)
1249                         hi = physmax - 1;
1250                 sgllen = mattr->dma_attr_sgllen;
1251                 pfnseg = mmu_btop(mattr->dma_attr_seg);
1252 
1253                 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1254                 if (align > MMU_PAGESIZE)
1255                         pfnalign = mmu_btop(align);
1256 
1257                 /*
1258                  * in order to satisfy the request, must minimally
1259                  * acquire minctg contiguous pages
1260                  */
1261                 minctg = howmany(*pgcnt, sgllen);
1262 
1263                 ASSERT(hi >= lo);
1264 
1265                 /*
1266                  * start from where last searched if the minctg >= lastctgcnt
1267                  */
1268                 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1269                         startpfn = lo;
1270         } else {
1271                 hi = physmax - 1;
1272                 lo = 0;
1273                 sgllen = 1;
1274                 pfnseg = mmu.highest_pfn;
1275                 minctg = *pgcnt;
1276 
1277                 if (minctg < lastctgcnt)
1278                         startpfn = lo;
1279         }
1280         lastctgcnt = minctg;
1281 
1282         ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1283 
1284         /* conserve 16m memory - start search above 16m when possible */
1285         if (hi > PFN_16M && startpfn < PFN_16M)
1286                 startpfn = PFN_16M;
1287 
1288         pfn = startpfn;
1289         if (pfnalign)
1290                 pfn = P2ROUNDUP(pfn, pfnalign);
1291 
1292         while (pfn + minctg - 1 <= hi) {
1293 
1294                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1295                 if (plist) {
1296                         page_list_concat(&pplist, &plist);
1297                         sgllen--;
1298                         /*
1299                          * return when contig pages no longer needed
1300                          */
1301                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1302                                 startpfn = pfn;
1303                                 CONTIG_UNLOCK();
1304                                 check_dma(mattr, pplist, *pgcnt);
1305                                 return (pplist);
1306                         }
1307                         minctg = howmany(*pgcnt, sgllen);
1308                 }
1309                 if (pfnalign)
1310                         pfn = P2ROUNDUP(pfn, pfnalign);
1311         }
1312 
1313         /* cannot find contig pages in specified range */
1314         if (startpfn == lo) {
1315                 CONTIG_UNLOCK();
1316                 return (NULL);
1317         }
1318 
1319         /* did not start with lo previously */
1320         pfn = lo;
1321         if (pfnalign)
1322                 pfn = P2ROUNDUP(pfn, pfnalign);
1323 
1324         /* allow search to go above startpfn */
1325         while (pfn < startpfn) {
1326 
1327                 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1328                 if (plist != NULL) {
1329 
1330                         page_list_concat(&pplist, &plist);
1331                         sgllen--;
1332 
1333                         /*
1334                          * return when contig pages no longer needed
1335                          */
1336                         if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1337                                 startpfn = pfn;
1338                                 CONTIG_UNLOCK();
1339                                 check_dma(mattr, pplist, *pgcnt);
1340                                 return (pplist);
1341                         }
1342                         minctg = howmany(*pgcnt, sgllen);
1343                 }
1344                 if (pfnalign)
1345                         pfn = P2ROUNDUP(pfn, pfnalign);
1346         }
1347         CONTIG_UNLOCK();
1348         return (NULL);
1349 }
1350 #endif  /* !__xpv */
1351 
1352 /*
1353  * mnode_range_cnt() calculates the number of memory ranges for mnode and
1354  * memranges[]. Used to determine the size of page lists and mnoderanges.
1355  */
1356 int
1357 mnode_range_cnt(int mnode)
1358 {
1359 #if defined(__xpv)
1360         ASSERT(mnode == 0);
1361         return (1);
1362 #else   /* __xpv */
1363         int     mri;
1364         int     mnrcnt = 0;
1365 
1366         if (mem_node_config[mnode].exists != 0) {
1367                 mri = nranges - 1;
1368 
1369                 /* find the memranges index below contained in mnode range */
1370 
1371                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1372                         mri--;
1373 
1374                 /*
1375                  * increment mnode range counter when memranges or mnode
1376                  * boundary is reached.
1377                  */
1378                 while (mri >= 0 &&
1379                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380                         mnrcnt++;
1381                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382                                 mri--;
1383                         else
1384                                 break;
1385                 }
1386         }
1387         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388         return (mnrcnt);
1389 #endif  /* __xpv */
1390 }
1391 
1392 /*
1393  * mnode_range_setup() initializes mnoderanges.
1394  */
1395 void
1396 mnode_range_setup(mnoderange_t *mnoderanges)
1397 {
1398         mnoderange_t *mp = mnoderanges;
1399         int     mnode, mri;
1400         int     mindex = 0;     /* current index into mnoderanges array */
1401         int     i, j;
1402         pfn_t   hipfn;
1403         int     last, hi;
1404 
1405         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1406                 if (mem_node_config[mnode].exists == 0)
1407                         continue;
1408 
1409                 mri = nranges - 1;
1410 
1411                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1412                         mri--;
1413 
1414                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1415                     MEMRANGELO(mri)) {
1416                         mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1417                             mem_node_config[mnode].physbase);
1418                         mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1419                             mem_node_config[mnode].physmax);
1420                         mnoderanges->mnr_mnode = mnode;
1421                         mnoderanges->mnr_memrange = mri;
1422                         mnoderanges->mnr_exists = 1;
1423                         mnoderanges++;
1424                         mindex++;
1425                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1426                                 mri--;
1427                         else
1428                                 break;
1429                 }
1430         }
1431 
1432         /*
1433          * For now do a simple sort of the mnoderanges array to fill in
1434          * the mnr_next fields.  Since mindex is expected to be relatively
1435          * small, using a simple O(N^2) algorithm.
1436          */
1437         for (i = 0; i < mindex; i++) {
1438                 if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1439                         break;
1440         }
1441         ASSERT(i < mindex);
1442         last = i;
1443         mtype16m = last;
1444         mp[last].mnr_next = -1;
1445         for (i = 0; i < mindex - 1; i++) {
1446                 hipfn = (pfn_t)(-1);
1447                 hi = -1;
1448                 /* find next highest mnode range */
1449                 for (j = 0; j < mindex; j++) {
1450                         if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1451                             mp[j].mnr_pfnlo < hipfn) {
1452                                 hipfn = mp[j].mnr_pfnlo;
1453                                 hi = j;
1454                         }
1455                 }
1456                 mp[hi].mnr_next = last;
1457                 last = hi;
1458         }
1459         mtypetop = last;
1460 }
1461 
1462 #ifndef __xpv
1463 /*
1464  * Update mnoderanges for memory hot-add DR operations.
1465  */
1466 static void
1467 mnode_range_add(int mnode)
1468 {
1469         int     *prev;
1470         int     n, mri;
1471         pfn_t   start, end;
1472         extern  void membar_sync(void);
1473 
1474         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475         ASSERT(mem_node_config[mnode].exists);
1476         start = mem_node_config[mnode].physbase;
1477         end = mem_node_config[mnode].physmax;
1478         ASSERT(start <= end);
1479         mutex_enter(&mnoderange_lock);
1480 
1481 #ifdef  DEBUG
1482         /* Check whether it interleaves with other memory nodes. */
1483         for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1484                 ASSERT(mnoderanges[n].mnr_exists);
1485                 if (mnoderanges[n].mnr_mnode == mnode)
1486                         continue;
1487                 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1488                     end < mnoderanges[n].mnr_pfnlo);
1489         }
1490 #endif  /* DEBUG */
1491 
1492         mri = nranges - 1;
1493         while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1494                 mri--;
1495         while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1496                 /* Check whether mtype already exists. */
1497                 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1498                         if (mnoderanges[n].mnr_mnode == mnode &&
1499                             mnoderanges[n].mnr_memrange == mri) {
1500                                 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1501                                     start);
1502                                 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1503                                     end);
1504                                 break;
1505                         }
1506                 }
1507 
1508                 /* Add a new entry if it doesn't exist yet. */
1509                 if (n == -1) {
1510                         /* Try to find an unused entry in mnoderanges array. */
1511                         for (n = 0; n < mnoderangecnt; n++) {
1512                                 if (mnoderanges[n].mnr_exists == 0)
1513                                         break;
1514                         }
1515                         ASSERT(n < mnoderangecnt);
1516                         mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1517                         mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1518                         mnoderanges[n].mnr_mnode = mnode;
1519                         mnoderanges[n].mnr_memrange = mri;
1520                         mnoderanges[n].mnr_exists = 1;
1521                         /* Page 0 should always be present. */
1522                         for (prev = &mtypetop;
1523                             mnoderanges[*prev].mnr_pfnlo > start;
1524                             prev = &mnoderanges[*prev].mnr_next) {
1525                                 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1526                                 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1527                         }
1528                         mnoderanges[n].mnr_next = *prev;
1529                         membar_sync();
1530                         *prev = n;
1531                 }
1532 
1533                 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1534                         mri--;
1535                 else
1536                         break;
1537         }
1538 
1539         mutex_exit(&mnoderange_lock);
1540 }
1541 
1542 /*
1543  * Update mnoderanges for memory hot-removal DR operations.
1544  */
1545 static void
1546 mnode_range_del(int mnode)
1547 {
1548         _NOTE(ARGUNUSED(mnode));
1549         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1550         /* TODO: support deletion operation. */
1551         ASSERT(0);
1552 }
1553 
1554 void
1555 plat_slice_add(pfn_t start, pfn_t end)
1556 {
1557         mem_node_add_slice(start, end);
1558         if (plat_dr_enabled()) {
1559                 mnode_range_add(PFN_2_MEM_NODE(start));
1560         }
1561 }
1562 
1563 void
1564 plat_slice_del(pfn_t start, pfn_t end)
1565 {
1566         ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1567         ASSERT(plat_dr_enabled());
1568         mnode_range_del(PFN_2_MEM_NODE(start));
1569         mem_node_del_slice(start, end);
1570 }
1571 #endif  /* __xpv */
1572 
1573 /*ARGSUSED*/
1574 int
1575 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1576 {
1577         int mtype = mtypetop;
1578 
1579 #if !defined(__xpv)
1580 #if defined(__i386)
1581         /*
1582          * set the mtype range
1583          * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1584          * - for non kmem requests, set range to above 4g if memory below 4g
1585          * runs low.
1586          */
1587         if (restricted_kmemalloc && VN_ISKAS(vp) &&
1588             (caddr_t)(vaddr) >= kernelheap &&
1589             (caddr_t)(vaddr) < ekernelheap) {
1590                 ASSERT(physmax4g);
1591                 mtype = mtype4g;
1592                 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1593                     btop(pgsz), *flags)) {
1594                         *flags |= PGI_MT_RANGE16M;
1595                 } else {
1596                         VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1597                         VM_STAT_COND_ADD((*flags & PG_PANIC),
1598                             vmm_vmstats.pgpanicalloc);
1599                         *flags |= PGI_MT_RANGE0;
1600                 }
1601                 return (mtype);
1602         }
1603 #endif  /* __i386 */
1604 
1605         if (RESTRICT4G_ALLOC) {
1606                 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1607                 /* here only for > 4g systems */
1608                 *flags |= PGI_MT_RANGE4G;
1609         } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1610                 *flags |= PGI_MT_RANGE16M;
1611         } else {
1612                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1613                 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1614                 *flags |= PGI_MT_RANGE0;
1615         }
1616 #endif /* !__xpv */
1617         return (mtype);
1618 }
1619 
1620 
1621 /* mtype init for page_get_replacement_page */
1622 /*ARGSUSED*/
1623 int
1624 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1625 {
1626         int mtype = mtypetop;
1627 #if !defined(__xpv)
1628         if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1629                 *flags |= PGI_MT_RANGE16M;
1630         } else {
1631                 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1632                 *flags |= PGI_MT_RANGE0;
1633         }
1634 #endif
1635         return (mtype);
1636 }
1637 
1638 /*
1639  * Determine if the mnode range specified in mtype contains memory belonging
1640  * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
1641  * the range from high pfn to 0, 16m or 4g.
1642  *
1643  * Return first mnode range type index found otherwise return -1 if none found.
1644  */
1645 int
1646 mtype_func(int mnode, int mtype, uint_t flags)
1647 {
1648         if (flags & PGI_MT_RANGE) {
1649                 int     mnr_lim = MRI_0;
1650 
1651                 if (flags & PGI_MT_NEXT) {
1652                         mtype = mnoderanges[mtype].mnr_next;
1653                 }
1654                 if (flags & PGI_MT_RANGE4G)
1655                         mnr_lim = MRI_4G;       /* exclude 0-4g range */
1656                 else if (flags & PGI_MT_RANGE16M)
1657                         mnr_lim = MRI_16M;      /* exclude 0-16m range */
1658                 while (mtype != -1 &&
1659                     mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1660                         if (mnoderanges[mtype].mnr_mnode == mnode)
1661                                 return (mtype);
1662                         mtype = mnoderanges[mtype].mnr_next;
1663                 }
1664         } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1665                 return (mtype);
1666         }
1667         return (-1);
1668 }
1669 
1670 /*
1671  * Update the page list max counts with the pfn range specified by the
1672  * input parameters.
1673  */
1674 void
1675 mtype_modify_max(pfn_t startpfn, long cnt)
1676 {
1677         int             mtype;
1678         pgcnt_t         inc;
1679         spgcnt_t        scnt = (spgcnt_t)(cnt);
1680         pgcnt_t         acnt = ABS(scnt);
1681         pfn_t           endpfn = startpfn + acnt;
1682         pfn_t           pfn, lo;
1683 
1684         if (!physmax4g)
1685                 return;
1686 
1687         mtype = mtypetop;
1688         for (pfn = endpfn; pfn > startpfn; ) {
1689                 ASSERT(mtype != -1);
1690                 lo = mnoderanges[mtype].mnr_pfnlo;
1691                 if (pfn > lo) {
1692                         if (startpfn >= lo) {
1693                                 inc = pfn - startpfn;
1694                         } else {
1695                                 inc = pfn - lo;
1696                         }
1697                         if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1698                                 if (scnt > 0)
1699                                         maxmem4g += inc;
1700                                 else
1701                                         maxmem4g -= inc;
1702                         }
1703                         pfn -= inc;
1704                 }
1705                 mtype = mnoderanges[mtype].mnr_next;
1706         }
1707 }
1708 
1709 int
1710 mtype_2_mrange(int mtype)
1711 {
1712         return (mnoderanges[mtype].mnr_memrange);
1713 }
1714 
1715 void
1716 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1717 {
1718         _NOTE(ARGUNUSED(mnode));
1719         ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1720         *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1721         *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1722 }
1723 
1724 size_t
1725 plcnt_sz(size_t ctrs_sz)
1726 {
1727 #ifdef DEBUG
1728         int     szc, colors;
1729 
1730         ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1731         for (szc = 0; szc < mmu_page_sizes; szc++) {
1732                 colors = page_get_pagecolors(szc);
1733                 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1734         }
1735 #endif
1736         return (ctrs_sz);
1737 }
1738 
1739 caddr_t
1740 plcnt_init(caddr_t addr)
1741 {
1742 #ifdef DEBUG
1743         int     mt, szc, colors;
1744 
1745         for (mt = 0; mt < mnoderangecnt; mt++) {
1746                 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1747                 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1748                 for (szc = 0; szc < mmu_page_sizes; szc++) {
1749                         colors = page_get_pagecolors(szc);
1750                         mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1751                         mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1752                             (pgcnt_t *)addr;
1753                         addr += (sizeof (pgcnt_t) * colors);
1754                 }
1755         }
1756 #endif
1757         return (addr);
1758 }
1759 
1760 void
1761 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1762 {
1763         _NOTE(ARGUNUSED(pp));
1764 #ifdef DEBUG
1765         int     bin = PP_2_BIN(pp);
1766 
1767         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1768         atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1769             cnt);
1770 #endif
1771         ASSERT(mtype == PP_2_MTYPE(pp));
1772         if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1773                 atomic_add_long(&freemem4g, cnt);
1774         if (flags & PG_CACHE_LIST)
1775                 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1776         else
1777                 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1778         atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1779 }
1780 
1781 /*
1782  * Returns the free page count for mnode
1783  */
1784 int
1785 mnode_pgcnt(int mnode)
1786 {
1787         int     mtype = mtypetop;
1788         int     flags = PGI_MT_RANGE0;
1789         pgcnt_t pgcnt = 0;
1790 
1791         mtype = mtype_func(mnode, mtype, flags);
1792 
1793         while (mtype != -1) {
1794                 pgcnt += MTYPE_FREEMEM(mtype);
1795                 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1796         }
1797         return (pgcnt);
1798 }
1799 
/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 *
 * l2_sz is the l2 cache size, l2_linesz its line size (only examined by
 * an ASSERT here) and l2_assoc its associativity, where 0 denotes a fully
 * associative cache.
 *
 * Side effects: advances memranges / shrinks nranges to skip memory
 * ranges that are not needed, may set physmax4g, and fills in l2_colors,
 * page_colors, page_colors_mask, cpu_page_colors, page_coloring_shift,
 * hw_page_array[], colorequivszc[] and mnoderangecnt.
 *
 * The returned byte count covers the mnoderanges array, the fpc_mutex and
 * cpc_mutex arrays, and the page_freelists and page_cachelists pointer
 * tables.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
        _NOTE(ARGUNUSED(l2_linesz));
        size_t  colorsz = 0;
        int     i;
        int     colors;

#if defined(__xpv)
        /*
         * Hypervisor domains currently don't have any concept of NUMA.
         * Hence we'll act like there is only 1 memrange.
         */
        i = memrange_num(1);
#else /* !__xpv */
        /*
         * Reduce the memory ranges lists if we don't have large amounts
         * of memory. This avoids searching known empty free lists.
         * To support memory DR operations, we need to keep memory ranges
         * for possible memory hot-add operations.
         */
        if (plat_dr_physmax > physmax)
                i = memrange_num(plat_dr_physmax);
        else
                i = memrange_num(physmax);
#if defined(__i386)
        if (i > MRI_4G)
                restricted_kmemalloc = 0;
#endif
        /* physmax greater than 4g */
        if (i == MRI_4G)
                physmax4g = 1;
#endif /* !__xpv */
        /* drop the first i entries from the memranges view */
        memranges += i;
        nranges -= i;

        ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);

        ASSERT(ISP2(l2_linesz));
        ASSERT(l2_sz > MMU_PAGESIZE);

        /* l2_assoc is 0 for fully associative l2 cache */
        if (l2_assoc)
                l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
        else
                l2_colors = 1;

        ASSERT(ISP2(l2_colors));

        /* for scalability, configure at least PAGE_COLORS_MIN color bins */
        page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

        /*
         * cpu_page_colors is non-zero when a page color may be spread across
         * multiple bins.
         */
        if (l2_colors < page_colors)
                cpu_page_colors = l2_colors;

        ASSERT(ISP2(page_colors));

        page_colors_mask = page_colors - 1;

        ASSERT(ISP2(CPUSETSIZE()));
        page_coloring_shift = lowbit(CPUSETSIZE());

        /*
         * initialize number of colors per page size; each level's color
         * count is the base-page color mask shifted down by the
         * difference in page shifts, plus one.
         */
        for (i = 0; i <= mmu.max_page_level; i++) {
                hw_page_array[i].hp_size = LEVEL_SIZE(i);
                hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
                hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
                hw_page_array[i].hp_colors = (page_colors_mask >>
                    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
                    + 1;
                colorequivszc[i] = 0;
        }

        /*
         * The value of cpu_page_colors determines if additional color bins
         * need to be checked for a particular color in the page_get routines.
         */
        if (cpu_page_colors != 0) {

                /*
                 * 'a' is the shift between page_colors and
                 * cpu_page_colors.  It is carried from one page size to
                 * the next, so it can only shrink as the loop proceeds.
                 */
                int a = lowbit(page_colors) - lowbit(cpu_page_colors);
                ASSERT(a > 0);
                ASSERT(a < 16);

                for (i = 0; i <= mmu.max_page_level; i++) {
                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                colorequivszc[i] = 0;
                                continue;
                        }
                        /* clamp the shift so that colors >> a is non-zero */
                        while ((colors >> a) == 0)
                                a--;
                        ASSERT(a >= 0);

                        /* higher 4 bits encodes color equiv mask */
                        colorequivszc[i] = (a << 4);
                }
        }

        /* factor in colorequiv to check additional 'equivalent' bins. */
        if (colorequiv > 1) {

                /* shift derived from colorequiv, capped at 15 */
                int a = lowbit(colorequiv) - 1;
                if (a > 15)
                        a = 15;

                for (i = 0; i <= mmu.max_page_level; i++) {
                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        /* keep whichever encoding is larger */
                        if ((a << 4) > colorequivszc[i]) {
                                colorequivszc[i] = (a << 4);
                        }
                }
        }

        /* size for mnoderanges */
        for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
                mnoderangecnt += mnode_range_cnt(i);
        if (plat_dr_support_memory()) {
                /*
                 * Reserve enough space for memory DR operations.
                 * Two extra mnoderanges for possible fragmentations,
                 * one for the 2G boundary and the other for the 4G boundary.
                 * We don't expect a memory board crossing the 16M boundary
                 * for memory hot-add operations on x86 platforms.
                 */
                mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
        }
        colorsz = mnoderangecnt * sizeof (mnoderange_t);

        /* size for fpc_mutex and cpc_mutex */
        colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

        /* size of page_freelists */
        colorsz += mnoderangecnt * sizeof (page_t ***);
        colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

        /* one list head per color for every page size and mnoderange */
        for (i = 0; i < mmu_page_sizes; i++) {
                colors = page_get_pagecolors(i);
                colorsz += mnoderangecnt * colors * sizeof (page_t *);
        }

        /* size of page_cachelists */
        colorsz += mnoderangecnt * sizeof (page_t **);
        colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

        return (colorsz);
}
1957 
1958 /*
1959  * Called once at startup to configure page_coloring data structures and
1960  * does the 1st page_free()/page_freelist_add().
1961  */
1962 void
1963 page_coloring_setup(caddr_t pcmemaddr)
1964 {
1965         int     i;
1966         int     j;
1967         int     k;
1968         caddr_t addr;
1969         int     colors;
1970 
1971         /*
1972          * do page coloring setup
1973          */
1974         addr = pcmemaddr;
1975 
1976         mnoderanges = (mnoderange_t *)addr;
1977         addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 
1979         mnode_range_setup(mnoderanges);
1980 
1981         if (physmax4g)
1982                 mtype4g = pfn_2_mtype(0xfffff);
1983 
1984         for (k = 0; k < NPC_MUTEX; k++) {
1985                 fpc_mutex[k] = (kmutex_t *)addr;
1986                 addr += (max_mem_nodes * sizeof (kmutex_t));
1987         }
1988         for (k = 0; k < NPC_MUTEX; k++) {
1989                 cpc_mutex[k] = (kmutex_t *)addr;
1990                 addr += (max_mem_nodes * sizeof (kmutex_t));
1991         }
1992         page_freelists = (page_t ****)addr;
1993         addr += (mnoderangecnt * sizeof (page_t ***));
1994 
1995         page_cachelists = (page_t ***)addr;
1996         addr += (mnoderangecnt * sizeof (page_t **));
1997 
1998         for (i = 0; i < mnoderangecnt; i++) {
1999                 page_freelists[i] = (page_t ***)addr;
2000                 addr += (mmu_page_sizes * sizeof (page_t **));
2001 
2002                 for (j = 0; j < mmu_page_sizes; j++) {
2003                         colors = page_get_pagecolors(j);
2004                         page_freelists[i][j] = (page_t **)addr;
2005                         addr += (colors * sizeof (page_t *));
2006                 }
2007                 page_cachelists[i] = (page_t **)addr;
2008                 addr += (page_colors * sizeof (page_t *));
2009         }
2010 }
2011 
2012 #if defined(__xpv)
2013 /*
2014  * Give back 10% of the io_pool pages to the free list.
2015  * Don't shrink the pool below some absolute minimum.
2016  */
2017 static void
2018 page_io_pool_shrink()
2019 {
2020         int retcnt;
2021         page_t *pp, *pp_first, *pp_last, **curpool;
2022         mfn_t mfn;
2023         int bothpools = 0;
2024 
2025         mutex_enter(&io_pool_lock);
2026         io_pool_shrink_attempts++;      /* should be a kstat? */
2027         retcnt = io_pool_cnt / 10;
2028         if (io_pool_cnt - retcnt < io_pool_cnt_min)
2029                 retcnt = io_pool_cnt - io_pool_cnt_min;
2030         if (retcnt <= 0)
2031                 goto done;
2032         io_pool_shrinks++;      /* should be a kstat? */
2033         curpool = &io_pool_4g;
2034 domore:
2035         /*
2036          * Loop through taking pages from the end of the list
2037          * (highest mfns) till amount to return reached.
2038          */
2039         for (pp = *curpool; pp && retcnt > 0; ) {
2040                 pp_first = pp_last = pp->p_prev;
2041                 if (pp_first == *curpool)
2042                         break;
2043                 retcnt--;
2044                 io_pool_cnt--;
2045                 page_io_pool_sub(curpool, pp_first, pp_last);
2046                 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
2047                         start_mfn = mfn;
2048                 page_free(pp_first, 1);
2049                 pp = *curpool;
2050         }
2051         if (retcnt != 0 && !bothpools) {
2052                 /*
2053                  * If not enough found in less constrained pool try the
2054                  * more constrained one.
2055                  */
2056                 curpool = &io_pool_16m;
2057                 bothpools = 1;
2058                 goto domore;
2059         }
2060 done:
2061         mutex_exit(&io_pool_lock);
2062 }
2063 
2064 #endif  /* __xpv */
2065 
2066 uint_t
2067 page_create_update_flags_x86(uint_t flags)
2068 {
2069 #if defined(__xpv)
2070         /*
2071          * Check this is an urgent allocation and free pages are depleted.
2072          */
2073         if (!(flags & PG_WAIT) && freemem < desfree)
2074                 page_io_pool_shrink();
2075 #else /* !__xpv */
2076         /*
2077          * page_create_get_something may call this because 4g memory may be
2078          * depleted. Set flags to allow for relocation of base page below
2079          * 4g if necessary.
2080          */
2081         if (physmax4g)
2082                 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
2083 #endif /* __xpv */
2084         return (flags);
2085 }
2086 
/*
 * Return the color for a buf.  This implementation ignores bp and always
 * answers 0.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
        /* every buf gets the same color here */
        return (0);
}
2093 
2094 #if defined(__xpv)
2095 
2096 /*
2097  * Take pages out of an io_pool
2098  */
2099 static void
2100 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2101 {
2102         if (*poolp == pp_first) {
2103                 *poolp = pp_last->p_next;
2104                 if (*poolp == pp_first)
2105                         *poolp = NULL;
2106         }
2107         pp_first->p_prev->p_next = pp_last->p_next;
2108         pp_last->p_next->p_prev = pp_first->p_prev;
2109         pp_first->p_prev = pp_last;
2110         pp_last->p_next = pp_first;
2111 }
2112 
2113 /*
2114  * Put a page on the io_pool list. The list is ordered by increasing MFN.
2115  */
2116 static void
2117 page_io_pool_add(page_t **poolp, page_t *pp)
2118 {
2119         page_t  *look;
2120         mfn_t   mfn = mfn_list[pp->p_pagenum];
2121 
2122         if (*poolp == NULL) {
2123                 *poolp = pp;
2124                 pp->p_next = pp;
2125                 pp->p_prev = pp;
2126                 return;
2127         }
2128 
2129         /*
2130          * Since we try to take pages from the high end of the pool
2131          * chances are good that the pages to be put on the list will
2132          * go at or near the end of the list. so start at the end and
2133          * work backwards.
2134          */
2135         look = (*poolp)->p_prev;
2136         while (mfn < mfn_list[look->p_pagenum]) {
2137                 look = look->p_prev;
2138                 if (look == (*poolp)->p_prev)
2139                         break; /* backed all the way to front of list */
2140         }
2141 
2142         /* insert after look */
2143         pp->p_prev = look;
2144         pp->p_next = look->p_next;
2145         pp->p_next->p_prev = pp;
2146         look->p_next = pp;
2147         if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2148                 /*
2149                  * we inserted a new first list element
2150                  * adjust pool pointer to newly inserted element
2151                  */
2152                 *poolp = pp;
2153         }
2154 }
2155 
2156 /*
2157  * Add a page to the io_pool.  Setting the force flag will force the page
2158  * into the io_pool no matter what.
2159  */
2160 static void
2161 add_page_to_pool(page_t *pp, int force)
2162 {
2163         page_t *highest;
2164         page_t *freep = NULL;
2165 
2166         mutex_enter(&io_pool_lock);
2167         /*
2168          * Always keep the scarce low memory pages
2169          */
2170         if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2171                 ++io_pool_cnt;
2172                 page_io_pool_add(&io_pool_16m, pp);
2173                 goto done;
2174         }
2175         if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2176                 ++io_pool_cnt;
2177                 page_io_pool_add(&io_pool_4g, pp);
2178         } else {
2179                 highest = io_pool_4g->p_prev;
2180                 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2181                         page_io_pool_sub(&io_pool_4g, highest, highest);
2182                         page_io_pool_add(&io_pool_4g, pp);
2183                         freep = highest;
2184                 } else {
2185                         freep = pp;
2186                 }
2187         }
2188 done:
2189         mutex_exit(&io_pool_lock);
2190         if (freep)
2191                 page_free(freep, 1);
2192 }
2193 
2194 
/*
 * State of the contiguous pfn list: an array of free-page pfns whose
 * underlying mfns are contiguous.  The routines visible below read and
 * modify these while holding contig_list_lock.
 */
int contig_pfn_cnt;     /* no of pfns in the contig pfn list */
int contig_pfn_max;     /* capacity of the contig pfn list */
int next_alloc_pfn;     /* next position in list to start a contig search */
int contig_pfnlist_updates;     /* pfn list update count */
int contig_pfnlist_builds;      /* how many times have we (re)built list */
int contig_pfnlist_buildfailed; /* how many times has list build failed */
int create_contig_pending;      /* nonzero means taskq creating contig list */
pfn_t *contig_pfn_list = NULL;  /* list of contig pfns in ascending mfn order */
2203 
2204 /*
2205  * Function to use in sorting a list of pfns by their underlying mfns.
2206  */
2207 static int
2208 mfn_compare(const void *pfnp1, const void *pfnp2)
2209 {
2210         mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2211         mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2212 
2213         if (mfn1 > mfn2)
2214                 return (1);
2215         if (mfn1 < mfn2)
2216                 return (-1);
2217         return (0);
2218 }
2219 
2220 /*
2221  * Compact the contig_pfn_list by tossing all the non-contiguous
2222  * elements from the list.
2223  */
2224 static void
2225 compact_contig_pfn_list(void)
2226 {
2227         pfn_t pfn, lapfn, prev_lapfn;
2228         mfn_t mfn;
2229         int i, newcnt = 0;
2230 
2231         prev_lapfn = 0;
2232         for (i = 0; i < contig_pfn_cnt - 1; i++) {
2233                 pfn = contig_pfn_list[i];
2234                 lapfn = contig_pfn_list[i + 1];
2235                 mfn = mfn_list[pfn];
2236                 /*
2237                  * See if next pfn is for a contig mfn
2238                  */
2239                 if (mfn_list[lapfn] != mfn + 1)
2240                         continue;
2241                 /*
2242                  * pfn and lookahead are both put in list
2243                  * unless pfn is the previous lookahead.
2244                  */
2245                 if (pfn != prev_lapfn)
2246                         contig_pfn_list[newcnt++] = pfn;
2247                 contig_pfn_list[newcnt++] = lapfn;
2248                 prev_lapfn = lapfn;
2249         }
2250         for (i = newcnt; i < contig_pfn_cnt; i++)
2251                 contig_pfn_list[i] = 0;
2252         contig_pfn_cnt = newcnt;
2253 }
2254 
2255 /*ARGSUSED*/
2256 static void
2257 call_create_contiglist(void *arg)
2258 {
2259         (void) create_contig_pfnlist(PG_WAIT);
2260 }
2261 
2262 /*
2263  * Create list of freelist pfns that have underlying
2264  * contiguous mfns.  The list is kept in ascending mfn order.
2265  * returns 1 if list created else 0.
2266  */
2267 static int
2268 create_contig_pfnlist(uint_t flags)
2269 {
2270         pfn_t pfn;
2271         page_t *pp;
2272         int ret = 1;
2273 
2274         mutex_enter(&contig_list_lock);
2275         if (contig_pfn_list != NULL)
2276                 goto out;
2277         contig_pfn_max = freemem + (freemem / 10);
2278         contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2279             (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2280         if (contig_pfn_list == NULL) {
2281                 /*
2282                  * If we could not create the contig list (because
2283                  * we could not sleep for memory).  Dispatch a taskq that can
2284                  * sleep to get the memory.
2285                  */
2286                 if (!create_contig_pending) {
2287                         if (taskq_dispatch(system_taskq, call_create_contiglist,
2288                             NULL, TQ_NOSLEEP) != TASKQID_INVALID)
2289                                 create_contig_pending = 1;
2290                 }
2291                 contig_pfnlist_buildfailed++;   /* count list build failures */
2292                 ret = 0;
2293                 goto out;
2294         }
2295         create_contig_pending = 0;
2296         ASSERT(contig_pfn_cnt == 0);
2297         for (pfn = 0; pfn < mfn_count; pfn++) {
2298                 pp = page_numtopp_nolock(pfn);
2299                 if (pp == NULL || !PP_ISFREE(pp))
2300                         continue;
2301                 contig_pfn_list[contig_pfn_cnt] = pfn;
2302                 if (++contig_pfn_cnt == contig_pfn_max)
2303                         break;
2304         }
2305         /*
2306          * Sanity check the new list.
2307          */
2308         if (contig_pfn_cnt < 2) { /* no contig pfns */
2309                 contig_pfn_cnt = 0;
2310                 contig_pfnlist_buildfailed++;
2311                 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2312                 contig_pfn_list = NULL;
2313                 contig_pfn_max = 0;
2314                 ret = 0;
2315                 goto out;
2316         }
2317         qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2318         compact_contig_pfn_list();
2319         /*
2320          * Make sure next search of the newly created contiguous pfn
2321          * list starts at the beginning of the list.
2322          */
2323         next_alloc_pfn = 0;
2324         contig_pfnlist_builds++;        /* count list builds */
2325 out:
2326         mutex_exit(&contig_list_lock);
2327         return (ret);
2328 }
2329 
2330 
2331 /*
2332  * Toss the current contig pfnlist.  Someone is about to do a massive
2333  * update to pfn<->mfn mappings.  So we have them destroy the list and lock
2334  * it till they are done with their update.
2335  */
2336 void
2337 clear_and_lock_contig_pfnlist()
2338 {
2339         pfn_t *listp = NULL;
2340         size_t listsize;
2341 
2342         mutex_enter(&contig_list_lock);
2343         if (contig_pfn_list != NULL) {
2344                 listp = contig_pfn_list;
2345                 listsize = contig_pfn_max * sizeof (pfn_t);
2346                 contig_pfn_list = NULL;
2347                 contig_pfn_max = contig_pfn_cnt = 0;
2348         }
2349         if (listp != NULL)
2350                 kmem_free(listp, listsize);
2351 }
2352 
2353 /*
2354  * Unlock the contig_pfn_list.  The next attempted use of it will cause
2355  * it to be re-created.
2356  */
2357 void
2358 unlock_contig_pfnlist()
2359 {
2360         mutex_exit(&contig_list_lock);
2361 }
2362 
/*
 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
 *
 * pfn is the page frame whose backing machine frame changed from oldmfn
 * to newmfn.  Any existing entry for pfn is removed from the list; if
 * newmfn is valid and adjacent to the mfn of an entry already in the
 * list, and there is room, pfn is inserted next to that entry.
 *
 * contig_list_lock is taken here unless the calling thread already owns
 * it (as a caller of clear_and_lock_contig_pfnlist() does), and is only
 * dropped again if it was taken here.  Nothing is done when no list
 * exists.
 */
void
update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
{
        int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
        pfn_t probe_pfn;
        mfn_t probe_mfn;
        int drop_lock = 0;

        /* only acquire (and later release) the lock if we don't hold it */
        if (mutex_owner(&contig_list_lock) != curthread) {
                drop_lock = 1;
                mutex_enter(&contig_list_lock);
        }
        if (contig_pfn_list == NULL)
                goto done;
        contig_pfnlist_updates++;
        /*
         * Find the pfn in the current list.  Use a binary chop to locate it.
         * The list is ordered by mfn, so oldmfn is the search key.
         *
         * NOTE(review): the midpoint rounds down, so once probe_hi is
         * probe_lo + 1 the probe stays on probe_lo and the search ends
         * without examining probe_hi; an entry in the final slot looks
         * like it would be reported as not found -- confirm.
         */
        probe_hi = contig_pfn_cnt - 1;
        probe_lo = 0;
        probe_pos = (probe_hi + probe_lo) / 2;
        while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
                if (probe_pos == probe_lo) { /* pfn not in list */
                        probe_pos = -1;
                        break;
                }
                if (pfn_to_mfn(probe_pfn) <= oldmfn)
                        probe_lo = probe_pos;
                else
                        probe_hi = probe_pos;
                probe_pos = (probe_hi + probe_lo) / 2;
        }
        if (probe_pos >= 0) {
                /*
                 * Remove pfn from list and ensure next alloc
                 * position stays in bounds.
                 */
                if (--contig_pfn_cnt <= next_alloc_pfn)
                        next_alloc_pfn = 0;
                /* fewer than two entries left: discard the whole list */
                if (contig_pfn_cnt < 2) { /* no contig pfns */
                        contig_pfn_cnt = 0;
                        kmem_free(contig_pfn_list,
                            contig_pfn_max * sizeof (pfn_t));
                        contig_pfn_list = NULL;
                        contig_pfn_max = 0;
                        goto done;
                }
                /* close the gap by sliding the following entries down */
                ovbcopy(&contig_pfn_list[probe_pos + 1],
                    &contig_pfn_list[probe_pos],
                    (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
        }
        /* pfn no longer has a machine frame: nothing to insert */
        if (newmfn == MFN_INVALID)
                goto done;
        /*
         * Check if new mfn has adjacent mfns in the list
         *
         * insert_after stays at -2 while no neighbour has been found.
         * When the probed entry's mfn is newmfn - 1 the new entry goes
         * right after it; when it is newmfn + 1 the new entry goes right
         * before it (which yields -1 for a probe at index 0).
         */
        probe_hi = contig_pfn_cnt - 1;
        probe_lo = 0;
        insert_after = -2;
        do {
                probe_pos = (probe_hi + probe_lo) / 2;
                probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
                if (newmfn == probe_mfn + 1)
                        insert_after = probe_pos;
                else if (newmfn == probe_mfn - 1)
                        insert_after = probe_pos - 1;
                if (probe_pos == probe_lo)
                        break;
                if (probe_mfn <= newmfn)
                        probe_lo = probe_pos;
                else
                        probe_hi = probe_pos;
        } while (insert_after == -2);
        /*
         * If there is space in the list and there are adjacent mfns
         * insert the pfn in to its proper place in the list.
         */
        if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
                insert_point = insert_after + 1;
                /* open a slot by sliding the following entries up */
                ovbcopy(&contig_pfn_list[insert_point],
                    &contig_pfn_list[insert_point + 1],
                    (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
                contig_pfn_list[insert_point] = pfn;
                contig_pfn_cnt++;
        }
done:
        if (drop_lock)
                mutex_exit(&contig_list_lock);
}
2455 
2456 /*
2457  * Called to (re-)populate the io_pool from the free page lists.
2458  */
2459 long
2460 populate_io_pool(void)
2461 {
2462         pfn_t pfn;
2463         mfn_t mfn, max_mfn;
2464         page_t *pp;
2465 
2466         /*
2467          * Figure out the bounds of the pool on first invocation.
2468          * We use a percentage of memory for the io pool size.
2469          * we allow that to shrink, but not to less than a fixed minimum
2470          */
2471         if (io_pool_cnt_max == 0) {
2472                 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2473                 io_pool_cnt_lowater = io_pool_cnt_max;
2474                 /*
2475                  * This is the first time in populate_io_pool, grab a va to use
2476                  * when we need to allocate pages.
2477                  */
2478                 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2479         }
2480         /*
2481          * If we are out of pages in the pool, then grow the size of the pool
2482          */
2483         if (io_pool_cnt == 0) {
2484                 /*
2485                  * Grow the max size of the io pool by 5%, but never more than
2486                  * 25% of physical memory.
2487                  */
2488                 if (io_pool_cnt_max < physmem / 4)
2489                         io_pool_cnt_max += io_pool_cnt_max / 20;
2490         }
2491         io_pool_grows++;        /* should be a kstat? */
2492 
2493         /*
2494          * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2495          */
2496         (void) mfn_to_pfn(start_mfn);
2497         max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2498         for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2499                 pfn = mfn_to_pfn(mfn);
2500                 if (pfn & PFN_IS_FOREIGN_MFN)
2501                         continue;
2502                 /*
2503                  * try to allocate it from free pages
2504                  */
2505                 pp = page_numtopp_alloc(pfn);
2506                 if (pp == NULL)
2507                         continue;
2508                 PP_CLRFREE(pp);
2509                 add_page_to_pool(pp, 1);
2510                 if (io_pool_cnt >= io_pool_cnt_max)
2511                         break;
2512         }
2513 
2514         return (io_pool_cnt);
2515 }
2516 
2517 /*
2518  * Destroy a page that was being used for DMA I/O. It may or
2519  * may not actually go back to the io_pool.
2520  */
2521 void
2522 page_destroy_io(page_t *pp)
2523 {
2524         mfn_t mfn = mfn_list[pp->p_pagenum];
2525 
2526         /*
2527          * When the page was alloc'd a reservation was made, release it now
2528          */
2529         page_unresv(1);
2530         /*
2531          * Unload translations, if any, then hash out the
2532          * page to erase its identity.
2533          */
2534         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2535         page_hashout(pp, NULL);
2536 
2537         /*
2538          * If the page came from the free lists, just put it back to them.
2539          * DomU pages always go on the free lists as well.
2540          */
2541         if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2542                 page_free(pp, 1);
2543                 return;
2544         }
2545 
2546         add_page_to_pool(pp, 0);
2547 }
2548 
2549 
/* Counters kept for contiguous page searches. */
long contig_searches;           /* count of times contig pages requested */
long contig_search_restarts;    /* count of contig ranges tried */
long contig_search_failed;      /* count of contig alloc failures */
2553 
2554 /*
2555  * Free partial page list
2556  */
2557 static void
2558 free_partial_list(page_t **pplist)
2559 {
2560         page_t *pp;
2561 
2562         while (*pplist != NULL) {
2563                 pp = *pplist;
2564                 page_io_pool_sub(pplist, pp, pp);
2565                 page_free(pp, 1);
2566         }
2567 }
2568 
2569 /*
2570  * Look thru the contiguous pfns that are not part of the io_pool for
2571  * contiguous free pages.  Return a list of the found pages or NULL.
2572  */
2573 page_t *
2574 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2575     pgcnt_t pfnalign)
2576 {
2577         page_t *pp, *plist = NULL;
2578         mfn_t mfn, prev_mfn, start_mfn;
2579         pfn_t pfn;
2580         int pages_needed, pages_requested;
2581         int search_start;
2582 
2583         /*
2584          * create the contig pfn list if not already done
2585          */
2586 retry:
2587         mutex_enter(&contig_list_lock);
2588         if (contig_pfn_list == NULL) {
2589                 mutex_exit(&contig_list_lock);
2590                 if (!create_contig_pfnlist(flags)) {
2591                         return (NULL);
2592                 }
2593                 goto retry;
2594         }
2595         contig_searches++;
2596         /*
2597          * Search contiguous pfn list for physically contiguous pages not in
2598          * the io_pool.  Start the search where the last search left off.
2599          */
2600         pages_requested = pages_needed = npages;
2601         search_start = next_alloc_pfn;
2602         start_mfn = prev_mfn = 0;
2603         while (pages_needed) {
2604                 pfn = contig_pfn_list[next_alloc_pfn];
2605                 mfn = pfn_to_mfn(pfn);
2606                 /*
2607                  * Check if mfn is first one or contig to previous one and
2608                  * if page corresponding to mfn is free and that mfn
2609                  * range is not crossing a segment boundary.
2610                  */
2611                 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2612                     (pp = page_numtopp_alloc(pfn)) != NULL &&
2613                     !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2614                         PP_CLRFREE(pp);
2615                         page_io_pool_add(&plist, pp);
2616                         pages_needed--;
2617                         if (prev_mfn == 0) {
2618                                 if (pfnalign &&
2619                                     mfn != P2ROUNDUP(mfn, pfnalign)) {
2620                                         /*
2621                                          * not properly aligned
2622                                          */
2623                                         contig_search_restarts++;
2624                                         free_partial_list(&plist);
2625                                         pages_needed = pages_requested;
2626                                         start_mfn = prev_mfn = 0;
2627                                         goto skip;
2628                                 }
2629                                 start_mfn = mfn;
2630                         }
2631                         prev_mfn = mfn;
2632                 } else {
2633                         contig_search_restarts++;
2634                         free_partial_list(&plist);
2635                         pages_needed = pages_requested;
2636                         start_mfn = prev_mfn = 0;
2637                 }
2638 skip:
2639                 if (++next_alloc_pfn == contig_pfn_cnt)
2640                         next_alloc_pfn = 0;
2641                 if (next_alloc_pfn == search_start)
2642                         break; /* all pfns searched */
2643         }
2644         mutex_exit(&contig_list_lock);
2645         if (pages_needed) {
2646                 contig_search_failed++;
2647                 /*
2648                  * Failed to find enough contig pages.
2649                  * free partial page list
2650                  */
2651                 free_partial_list(&plist);
2652         }
2653         return (plist);
2654 }
2655 
2656 /*
2657  * Search the reserved io pool pages for a page range with the
2658  * desired characteristics.
2659  */
2660 page_t *
2661 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2662 {
2663         page_t *pp_first, *pp_last;
2664         page_t *pp, **poolp;
2665         pgcnt_t nwanted, pfnalign;
2666         uint64_t pfnseg;
2667         mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2668         int align, attempt = 0;
2669 
2670         if (minctg == 1)
2671                 contig = 0;
2672         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2673         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2674         pfnseg = mmu_btop(mattr->dma_attr_seg);
2675         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2676         if (align > MMU_PAGESIZE)
2677                 pfnalign = mmu_btop(align);
2678         else
2679                 pfnalign = 0;
2680 
2681 try_again:
2682         /*
2683          * See if we want pages for a legacy device
2684          */
2685         if (hi_mfn < PFN_16MEG)
2686                 poolp = &io_pool_16m;
2687         else
2688                 poolp = &io_pool_4g;
2689 try_smaller:
2690         /*
2691          * Take pages from I/O pool. We'll use pages from the highest
2692          * MFN range possible.
2693          */
2694         pp_first = pp_last = NULL;
2695         mutex_enter(&io_pool_lock);
2696         nwanted = minctg;
2697         for (pp = *poolp; pp && nwanted > 0; ) {
2698                 pp = pp->p_prev;
2699 
2700                 /*
2701                  * skip pages above allowable range
2702                  */
2703                 mfn = mfn_list[pp->p_pagenum];
2704                 if (hi_mfn < mfn)
2705                         goto skip;
2706 
2707                 /*
2708                  * stop at pages below allowable range
2709                  */
2710                 if (lo_mfn > mfn)
2711                         break;
2712 restart:
2713                 if (pp_last == NULL) {
2714                         /*
2715                          * Check alignment
2716                          */
2717                         tmfn = mfn - (minctg - 1);
2718                         if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2719                                 goto skip; /* not properly aligned */
2720                         /*
2721                          * Check segment
2722                          */
2723                         if ((mfn & pfnseg) < (tmfn & pfnseg))
2724                                 goto skip; /* crosses seg boundary */
2725                         /*
2726                          * Start building page list
2727                          */
2728                         pp_first = pp_last = pp;
2729                         nwanted--;
2730                 } else {
2731                         /*
2732                          * check physical contiguity if required
2733                          */
2734                         if (contig &&
2735                             mfn_list[pp_first->p_pagenum] != mfn + 1) {
2736                                 /*
2737                                  * not a contiguous page, restart list.
2738                                  */
2739                                 pp_last = NULL;
2740                                 nwanted = minctg;
2741                                 goto restart;
2742                         } else { /* add page to list */
2743                                 pp_first = pp;
2744                                 nwanted--;
2745                         }
2746                 }
2747 skip:
2748                 if (pp == *poolp)
2749                         break;
2750         }
2751 
2752         /*
2753          * If we didn't find memory. Try the more constrained pool, then
2754          * sweep free pages into the DMA pool and try again.
2755          */
2756         if (nwanted != 0) {
2757                 mutex_exit(&io_pool_lock);
2758                 /*
2759                  * If we were looking in the less constrained pool and
2760                  * didn't find pages, try the more constrained pool.
2761                  */
2762                 if (poolp == &io_pool_4g) {
2763                         poolp = &io_pool_16m;
2764                         goto try_smaller;
2765                 }
2766                 kmem_reap();
2767                 if (++attempt < 4) {
2768                         /*
2769                          * Grab some more io_pool pages
2770                          */
2771                         (void) populate_io_pool();
2772                         goto try_again; /* go around and retry */
2773                 }
2774                 return (NULL);
2775         }
2776         /*
2777          * Found the pages, now snip them from the list
2778          */
2779         page_io_pool_sub(poolp, pp_first, pp_last);
2780         io_pool_cnt -= minctg;
2781         /*
2782          * reset low water mark
2783          */
2784         if (io_pool_cnt < io_pool_cnt_lowater)
2785                 io_pool_cnt_lowater = io_pool_cnt;
2786         mutex_exit(&io_pool_lock);
2787         return (pp_first);
2788 }
2789 
2790 page_t *
2791 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2792     ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2793 {
2794         uint_t kflags;
2795         int order, extra, extpages, i, contig, nbits, extents;
2796         page_t *pp, *expp, *pp_first, **pplist = NULL;
2797         mfn_t *mfnlist = NULL;
2798 
2799         contig = flags & PG_PHYSCONTIG;
2800         if (minctg == 1)
2801                 contig = 0;
2802         flags &= ~PG_PHYSCONTIG;
2803         kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2804         /*
2805          * Hypervisor will allocate extents, if we want contig
2806          * pages extent must be >= minctg
2807          */
2808         if (contig) {
2809                 order = highbit(minctg) - 1;
2810                 if (minctg & ((1 << order) - 1))
2811                         order++;
2812                 extpages = 1 << order;
2813         } else {
2814                 order = 0;
2815                 extpages = minctg;
2816         }
2817         if (extpages > minctg) {
2818                 extra = extpages - minctg;
2819                 if (!page_resv(extra, kflags))
2820                         return (NULL);
2821         }
2822         pp_first = NULL;
2823         pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2824         if (pplist == NULL)
2825                 goto balloon_fail;
2826         mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2827         if (mfnlist == NULL)
2828                 goto balloon_fail;
2829         pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2830         if (pp == NULL)
2831                 goto balloon_fail;
2832         pp_first = pp;
2833         if (extpages > minctg) {
2834                 /*
2835                  * fill out the rest of extent pages to swap
2836                  * with the hypervisor
2837                  */
2838                 for (i = 0; i < extra; i++) {
2839                         expp = page_create_va(vp,
2840                             (u_offset_t)(uintptr_t)io_pool_kva,
2841                             PAGESIZE, flags, &kvseg, io_pool_kva);
2842                         if (expp == NULL)
2843                                 goto balloon_fail;
2844                         (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2845                         page_io_unlock(expp);
2846                         page_hashout(expp, NULL);
2847                         page_io_lock(expp);
2848                         /*
2849                          * add page to end of list
2850                          */
2851                         expp->p_prev = pp_first->p_prev;
2852                         expp->p_next = pp_first;
2853                         expp->p_prev->p_next = expp;
2854                         pp_first->p_prev = expp;
2855                 }
2856 
2857         }
2858         for (i = 0; i < extpages; i++) {
2859                 pplist[i] = pp;
2860                 pp = pp->p_next;
2861         }
2862         nbits = highbit(mattr->dma_attr_addr_hi);
2863         extents = contig ? 1 : minctg;
2864         if (balloon_replace_pages(extents, pplist, nbits, order,
2865             mfnlist) != extents) {
2866                 if (ioalloc_dbg)
2867                         cmn_err(CE_NOTE, "request to hypervisor"
2868                             " for %d pages, maxaddr %" PRIx64 " failed",
2869                             extpages, mattr->dma_attr_addr_hi);
2870                 goto balloon_fail;
2871         }
2872 
2873         kmem_free(pplist, extpages * sizeof (page_t *));
2874         kmem_free(mfnlist, extpages * sizeof (mfn_t));
2875         /*
2876          * Return any excess pages to free list
2877          */
2878         if (extpages > minctg) {
2879                 for (i = 0; i < extra; i++) {
2880                         pp = pp_first->p_prev;
2881                         page_sub(&pp_first, pp);
2882                         page_io_unlock(pp);
2883                         page_unresv(1);
2884                         page_free(pp, 1);
2885                 }
2886         }
2887         return (pp_first);
2888 balloon_fail:
2889         /*
2890          * Return pages to free list and return failure
2891          */
2892         while (pp_first != NULL) {
2893                 pp = pp_first;
2894                 page_sub(&pp_first, pp);
2895                 page_io_unlock(pp);
2896                 if (pp->p_vnode != NULL)
2897                         page_hashout(pp, NULL);
2898                 page_free(pp, 1);
2899         }
2900         if (pplist)
2901                 kmem_free(pplist, extpages * sizeof (page_t *));
2902         if (mfnlist)
2903                 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2904         page_unresv(extpages - minctg);
2905         return (NULL);
2906 }
2907 
2908 static void
2909 return_partial_alloc(page_t *plist)
2910 {
2911         page_t *pp;
2912 
2913         while (plist != NULL) {
2914                 pp = plist;
2915                 page_sub(&plist, pp);
2916                 page_io_unlock(pp);
2917                 page_destroy_io(pp);
2918         }
2919 }
2920 
2921 static page_t *
2922 page_get_contigpages(
2923         struct vnode    *vp,
2924         u_offset_t      off,
2925         int             *npagesp,
2926         uint_t          flags,
2927         caddr_t         vaddr,
2928         ddi_dma_attr_t  *mattr)
2929 {
2930         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2931         page_t  *plist; /* list to return */
2932         page_t  *pp, *mcpl;
2933         int     contig, anyaddr, npages, getone = 0;
2934         mfn_t   lo_mfn;
2935         mfn_t   hi_mfn;
2936         pgcnt_t pfnalign = 0;
2937         int     align, sgllen;
2938         uint64_t pfnseg;
2939         pgcnt_t minctg;
2940 
2941         npages = *npagesp;
2942         ASSERT(mattr != NULL);
2943         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2944         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2945         sgllen = mattr->dma_attr_sgllen;
2946         pfnseg = mmu_btop(mattr->dma_attr_seg);
2947         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2948         if (align > MMU_PAGESIZE)
2949                 pfnalign = mmu_btop(align);
2950 
2951         contig = flags & PG_PHYSCONTIG;
2952         if (npages == -1) {
2953                 npages = 1;
2954                 pfnalign = 0;
2955         }
2956         /*
2957          * Clear the contig flag if only one page is needed.
2958          */
2959         if (npages == 1) {
2960                 getone = 1;
2961                 contig = 0;
2962         }
2963 
2964         /*
2965          * Check if any page in the system is fine.
2966          */
2967         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2968         if (!contig && anyaddr && !pfnalign) {
2969                 flags &= ~PG_PHYSCONTIG;
2970                 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2971                     flags, &kvseg, vaddr);
2972                 if (plist != NULL) {
2973                         *npagesp = 0;
2974                         return (plist);
2975                 }
2976         }
2977         plist = NULL;
2978         minctg = howmany(npages, sgllen);
2979         while (npages > sgllen || getone) {
2980                 if (minctg > npages)
2981                         minctg = npages;
2982                 mcpl = NULL;
2983                 /*
2984                  * We could want contig pages with no address range limits.
2985                  */
2986                 if (anyaddr && contig) {
2987                         /*
2988                          * Look for free contig pages to satisfy the request.
2989                          */
2990                         mcpl = find_contig_free(minctg, flags, pfnseg,
2991                             pfnalign);
2992                 }
2993                 /*
2994                  * Try the reserved io pools next
2995                  */
2996                 if (mcpl == NULL)
2997                         mcpl = page_io_pool_alloc(mattr, contig, minctg);
2998                 if (mcpl != NULL) {
2999                         pp = mcpl;
3000                         do {
3001                                 if (!page_hashin(pp, vp, off, NULL)) {
3002                                         panic("page_get_contigpages:"
3003                                             " hashin failed"
3004                                             " pp %p, vp %p, off %llx",
3005                                             (void *)pp, (void *)vp, off);
3006                                 }
3007                                 off += MMU_PAGESIZE;
3008                                 PP_CLRFREE(pp);
3009                                 PP_CLRAGED(pp);
3010                                 page_set_props(pp, P_REF);
3011                                 page_io_lock(pp);
3012                                 pp = pp->p_next;
3013                         } while (pp != mcpl);
3014                 } else {
3015                         /*
3016                          * Hypervisor exchange doesn't handle segment or
3017                          * alignment constraints
3018                          */
3019                         if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
3020                             pfnalign)
3021                                 goto fail;
3022                         /*
3023                          * Try exchanging pages with the hypervisor
3024                          */
3025                         mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
3026                             flags, minctg);
3027                         if (mcpl == NULL)
3028                                 goto fail;
3029                         off += minctg * MMU_PAGESIZE;
3030                 }
3031                 check_dma(mattr, mcpl, minctg);
3032                 /*
3033                  * Here with a minctg run of contiguous pages, add them to the
3034                  * list we will return for this request.
3035                  */
3036                 page_list_concat(&plist, &mcpl);
3037                 npages -= minctg;
3038                 *npagesp = npages;
3039                 sgllen--;
3040                 if (getone)
3041                         break;
3042         }
3043         return (plist);
3044 fail:
3045         return_partial_alloc(plist);
3046         return (NULL);
3047 }
3048 
3049 /*
3050  * Allocator for domain 0 I/O pages. We match the required
3051  * DMA attributes and contiguity constraints.
3052  */
3053 /*ARGSUSED*/
3054 page_t *
3055 page_create_io(
3056         struct vnode    *vp,
3057         u_offset_t      off,
3058         uint_t          bytes,
3059         uint_t          flags,
3060         struct as       *as,
3061         caddr_t         vaddr,
3062         ddi_dma_attr_t  *mattr)
3063 {
3064         page_t  *plist = NULL, *pp;
3065         int     npages = 0, contig, anyaddr, pages_req;
3066         mfn_t   lo_mfn;
3067         mfn_t   hi_mfn;
3068         pgcnt_t pfnalign = 0;
3069         int     align;
3070         int     is_domu = 0;
3071         int     dummy, bytes_got;
3072         mfn_t   max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
3073 
3074         ASSERT(mattr != NULL);
3075         lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
3076         hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
3077         align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
3078         if (align > MMU_PAGESIZE)
3079                 pfnalign = mmu_btop(align);
3080 
3081         /*
3082          * Clear the contig flag if only one page is needed or the scatter
3083          * gather list length is >= npages.
3084          */
3085         pages_req = npages = mmu_btopr(bytes);
3086         contig = (flags & PG_PHYSCONTIG);
3087         bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
3088         if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
3089                 contig = 0;
3090 
3091         /*
3092          * Check if any old page in the system is fine.
3093          * DomU should always go down this path.
3094          */
3095         is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
3096         anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
3097         if ((!contig && anyaddr) || is_domu) {
3098                 flags &= ~PG_PHYSCONTIG;
3099                 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
3100                 if (plist != NULL)
3101                         return (plist);
3102                 else if (is_domu)
3103                         return (NULL); /* no memory available */
3104         }
3105         /*
3106          * DomU should never reach here
3107          */
3108         if (contig) {
3109                 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3110                     mattr);
3111                 if (plist == NULL)
3112                         goto fail;
3113                 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3114                 vaddr += bytes_got;
3115                 off += bytes_got;
3116                 /*
3117                  * We now have all the contiguous pages we need, but
3118                  * we may still need additional non-contiguous pages.
3119                  */
3120         }
3121         /*
3122          * now loop collecting the requested number of pages, these do
3123          * not have to be contiguous pages but we will use the contig
3124          * page alloc code to get the pages since it will honor any
3125          * other constraints the pages may have.
3126          */
3127         while (npages--) {
3128                 dummy = -1;
3129                 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3130                 if (pp == NULL)
3131                         goto fail;
3132                 page_add(&plist, pp);
3133                 vaddr += MMU_PAGESIZE;
3134                 off += MMU_PAGESIZE;
3135         }
3136         return (plist);
3137 fail:
3138         /*
3139          * Failed to get enough pages, return ones we did get
3140          */
3141         return_partial_alloc(plist);
3142         return (NULL);
3143 }
3144 
3145 /*
3146  * Lock and return the page with the highest mfn that we can find.  last_mfn
3147  * holds the last one found, so the next search can start from there.  We
3148  * also keep a counter so that we don't loop forever if the machine has no
3149  * free pages.
3150  *
3151  * This is called from the balloon thread to find pages to give away.  new_high
3152  * is used when new mfn's have been added to the system - we will reset our
3153  * search if the new mfn's are higher than our current search position.
3154  */
3155 page_t *
3156 page_get_high_mfn(mfn_t new_high)
3157 {
3158         static mfn_t last_mfn = 0;
3159         pfn_t pfn;
3160         page_t *pp;
3161         ulong_t loop_count = 0;
3162 
3163         if (new_high > last_mfn)
3164                 last_mfn = new_high;
3165 
3166         for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3167                 if (last_mfn == 0) {
3168                         last_mfn = cached_max_mfn;
3169                 }
3170 
3171                 pfn = mfn_to_pfn(last_mfn);
3172                 if (pfn & PFN_IS_FOREIGN_MFN)
3173                         continue;
3174 
3175                 /* See if the page is free.  If so, lock it. */
3176                 pp = page_numtopp_alloc(pfn);
3177                 if (pp == NULL)
3178                         continue;
3179                 PP_CLRFREE(pp);
3180 
3181                 ASSERT(PAGE_EXCL(pp));
3182                 ASSERT(pp->p_vnode == NULL);
3183                 ASSERT(!hat_page_is_mapped(pp));
3184                 last_mfn--;
3185                 return (pp);
3186         }
3187         return (NULL);
3188 }
3189 
3190 #else /* !__xpv */
3191 
3192 /*
3193  * get a page from any list with the given mnode
3194  */
3195 static page_t *
3196 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3197     int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3198 {
3199         kmutex_t                *pcm;
3200         int                     i;
3201         page_t                  *pp;
3202         page_t                  *first_pp;
3203         uint64_t                pgaddr;
3204         ulong_t                 bin;
3205         int                     mtypestart;
3206         int                     plw_initialized;
3207         page_list_walker_t      plw;
3208 
3209         VM_STAT_ADD(pga_vmstats.pgma_alloc);
3210 
3211         ASSERT((flags & PG_MATCH_COLOR) == 0);
3212         ASSERT(szc == 0);
3213         ASSERT(dma_attr != NULL);
3214 
3215         MTYPE_START(mnode, mtype, flags);
3216         if (mtype < 0) {
3217                 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3218                 return (NULL);
3219         }
3220 
3221         mtypestart = mtype;
3222 
3223         bin = origbin;
3224 
3225         /*
3226          * check up to page_colors + 1 bins - origbin may be checked twice
3227          * because of BIN_STEP skip
3228          */
3229         do {
3230                 plw_initialized = 0;
3231 
3232                 for (plw.plw_count = 0;
3233                     plw.plw_count < page_colors; plw.plw_count++) {
3234 
3235                         if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3236                                 goto nextfreebin;
3237 
3238                         pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3239                         mutex_enter(pcm);
3240                         pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3241                         first_pp = pp;
3242                         while (pp != NULL) {
3243                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3244                                     SE_EXCL) == 0) {
3245                                         pp = pp->p_next;
3246                                         if (pp == first_pp) {
3247                                                 pp = NULL;
3248                                         }
3249                                         continue;
3250                                 }
3251 
3252                                 ASSERT(PP_ISFREE(pp));
3253                                 ASSERT(PP_ISAGED(pp));
3254                                 ASSERT(pp->p_vnode == NULL);
3255                                 ASSERT(pp->p_hash == NULL);
3256                                 ASSERT(pp->p_offset == (u_offset_t)-1);
3257                                 ASSERT(pp->p_szc == szc);
3258                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3259                                 /* check if page within DMA attributes */
3260                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3261                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3262                                     (pgaddr + MMU_PAGESIZE - 1 <=
3263                                     dma_attr->dma_attr_addr_hi)) {
3264                                         break;
3265                                 }
3266 
3267                                 /* continue looking */
3268                                 page_unlock(pp);
3269                                 pp = pp->p_next;
3270                                 if (pp == first_pp)
3271                                         pp = NULL;
3272 
3273                         }
3274                         if (pp != NULL) {
3275                                 ASSERT(mtype == PP_2_MTYPE(pp));
3276                                 ASSERT(pp->p_szc == 0);
3277 
3278                                 /* found a page with specified DMA attributes */
3279                                 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3280                                     mtype), pp);
3281                                 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3282 
3283                                 if ((PP_ISFREE(pp) == 0) ||
3284                                     (PP_ISAGED(pp) == 0)) {
3285                                         cmn_err(CE_PANIC, "page %p is not free",
3286                                             (void *)pp);
3287                                 }
3288 
3289                                 mutex_exit(pcm);
3290                                 check_dma(dma_attr, pp, 1);
3291                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3292                                 return (pp);
3293                         }
3294                         mutex_exit(pcm);
3295 nextfreebin:
3296                         if (plw_initialized == 0) {
3297                                 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3298                                 ASSERT(plw.plw_ceq_dif == page_colors);
3299                                 plw_initialized = 1;
3300                         }
3301 
3302                         if (plw.plw_do_split) {
3303                                 pp = page_freelist_split(szc, bin, mnode,
3304                                     mtype,
3305                                     mmu_btop(dma_attr->dma_attr_addr_lo),
3306                                     mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3307                                     &plw);
3308                                 if (pp != NULL) {
3309                                         check_dma(dma_attr, pp, 1);
3310                                         return (pp);
3311                                 }
3312                         }
3313 
3314                         bin = page_list_walk_next_bin(szc, bin, &plw);
3315                 }
3316 
3317                 MTYPE_NEXT(mnode, mtype, flags);
3318         } while (mtype >= 0);
3319 
3320         /* failed to find a page in the freelist; try it in the cachelist */
3321 
3322         /* reset mtype start for cachelist search */
3323         mtype = mtypestart;
3324         ASSERT(mtype >= 0);
3325 
3326         /* start with the bin of matching color */
3327         bin = origbin;
3328 
3329         do {
3330                 for (i = 0; i <= page_colors; i++) {
3331                         if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3332                                 goto nextcachebin;
3333                         pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3334                         mutex_enter(pcm);
3335                         pp = PAGE_CACHELISTS(mnode, bin, mtype);
3336                         first_pp = pp;
3337                         while (pp != NULL) {
3338                                 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3339                                     SE_EXCL) == 0) {
3340                                         pp = pp->p_next;
3341                                         if (pp == first_pp)
3342                                                 pp = NULL;
3343                                         continue;
3344                                 }
3345                                 ASSERT(pp->p_vnode);
3346                                 ASSERT(PP_ISAGED(pp) == 0);
3347                                 ASSERT(pp->p_szc == 0);
3348                                 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3349 
3350                                 /* check if page within DMA attributes */
3351 
3352                                 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3353                                 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3354                                     (pgaddr + MMU_PAGESIZE - 1 <=
3355                                     dma_attr->dma_attr_addr_hi)) {
3356                                         break;
3357                                 }
3358 
3359                                 /* continue looking */
3360                                 page_unlock(pp);
3361                                 pp = pp->p_next;
3362                                 if (pp == first_pp)
3363                                         pp = NULL;
3364                         }
3365 
3366                         if (pp != NULL) {
3367                                 ASSERT(mtype == PP_2_MTYPE(pp));
3368                                 ASSERT(pp->p_szc == 0);
3369 
3370                                 /* found a page with specified DMA attributes */
3371                                 page_sub(&PAGE_CACHELISTS(mnode, bin,
3372                                     mtype), pp);
3373                                 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3374 
3375                                 mutex_exit(pcm);
3376                                 ASSERT(pp->p_vnode);
3377                                 ASSERT(PP_ISAGED(pp) == 0);
3378                                 check_dma(dma_attr, pp, 1);
3379                                 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3380                                 return (pp);
3381                         }
3382                         mutex_exit(pcm);
3383 nextcachebin:
3384                         bin += (i == 0) ? BIN_STEP : 1;
3385                         bin &= page_colors_mask;
3386                 }
3387                 MTYPE_NEXT(mnode, mtype, flags);
3388         } while (mtype >= 0);
3389 
3390         VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3391         return (NULL);
3392 }
3393 
3394 /*
3395  * This function is similar to page_get_freelist()/page_get_cachelist()
3396  * but it searches both the lists to find a page with the specified
3397  * color (or no color) and DMA attributes. The search is done in the
3398  * freelist first and then in the cache list within the highest memory
3399  * range (based on DMA attributes) before searching in the lower
3400  * memory ranges.
3401  *
3402  * Note: This function is called only by page_create_io().
3403  */
3404 /*ARGSUSED*/
3405 static page_t *
3406 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3407     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3408 {
3409         uint_t          bin;
3410         int             mtype;
3411         page_t          *pp;
3412         int             n;
3413         int             m;
3414         int             szc;
3415         int             fullrange;
3416         int             mnode;
3417         int             local_failed_stat = 0;
3418         lgrp_mnode_cookie_t     lgrp_cookie;
3419 
3420         VM_STAT_ADD(pga_vmstats.pga_alloc);
3421 
3422         /* only base pagesize currently supported */
3423         if (size != MMU_PAGESIZE)
3424                 return (NULL);
3425 
3426         /*
3427          * If we're passed a specific lgroup, we use it.  Otherwise,
3428          * assume first-touch placement is desired.
3429          */
3430         if (!LGRP_EXISTS(lgrp))
3431                 lgrp = lgrp_home_lgrp();
3432 
3433         /* LINTED */
3434         AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3435 
3436         /*
3437          * Only hold one freelist or cachelist lock at a time, that way we
3438          * can start anywhere and not have to worry about lock
3439          * ordering.
3440          */
3441         if (dma_attr == NULL) {
3442                 n = mtype16m;
3443                 m = mtypetop;
3444                 fullrange = 1;
3445                 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3446         } else {
3447                 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3448                 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3449 
3450                 /*
3451                  * We can guarantee alignment only for page boundary.
3452                  */
3453                 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3454                         return (NULL);
3455 
3456                 /* Sanity check the dma_attr */
3457                 if (pfnlo > pfnhi)
3458                         return (NULL);
3459 
3460                 n = pfn_2_mtype(pfnlo);
3461                 m = pfn_2_mtype(pfnhi);
3462 
3463                 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3464                     (pfnhi >= mnoderanges[m].mnr_pfnhi));
3465         }
3466         VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3467 
3468         szc = 0;
3469 
3470         /* cylcing thru mtype handled by RANGE0 if n == mtype16m */
3471         if (n == mtype16m) {
3472                 flags |= PGI_MT_RANGE0;
3473                 n = m;
3474         }
3475 
3476         /*
3477          * Try local memory node first, but try remote if we can't
3478          * get a page of the right color.
3479          */
3480         LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3481         while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3482                 /*
3483                  * allocate pages from high pfn to low.
3484                  */
3485                 mtype = m;
3486                 do {
3487                         if (fullrange != 0) {
3488                                 pp = page_get_mnode_freelist(mnode,
3489                                     bin, mtype, szc, flags);
3490                                 if (pp == NULL) {
3491                                         pp = page_get_mnode_cachelist(
3492                                             bin, flags, mnode, mtype);
3493                                 }
3494                         } else {
3495                                 pp = page_get_mnode_anylist(bin, szc,
3496                                     flags, mnode, mtype, dma_attr);
3497                         }
3498                         if (pp != NULL) {
3499                                 VM_STAT_ADD(pga_vmstats.pga_allocok);
3500                                 check_dma(dma_attr, pp, 1);
3501                                 return (pp);
3502                         }
3503                 } while (mtype != n &&
3504                     (mtype = mnoderanges[mtype].mnr_next) != -1);
3505                 if (!local_failed_stat) {
3506                         lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3507                         local_failed_stat = 1;
3508                 }
3509         }
3510         VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3511 
3512         return (NULL);
3513 }
3514 
3515 /*
3516  * page_create_io()
3517  *
3518  * This function is a copy of page_create_va() with an additional
3519  * argument 'mattr' that specifies DMA memory requirements to
3520  * the page list functions. This function is used by the segkmem
3521  * allocator, so it is only used to create new pages (i.e., PG_EXCL
3522  * is set).
3523  *
3524  * Note: This interface is currently used by x86 PSM only and is
3525  *       not fully specified so the commitment level is only for
3526  *       private interface specific to x86. This interface uses PSM
3527  *       specific page_get_anylist() interface.
3528  */
3529 
/*
 * Walk the page_hash[] chain selected by 'index' looking for the page
 * whose identity is <vp, off>.  On completion 'pp' points at the matching
 * page, or is NULL if the chain holds no such page.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define PAGE_HASH_SEARCH(index, pp, vp, off) \
        do { \
                for ((pp) = page_hash[(index)]; (pp) != NULL; \
                    (pp) = (pp)->p_hash) { \
                        if ((pp)->p_vnode == (vp) && \
                            (pp)->p_offset == (off)) \
                                break; \
                } \
        } while (0)
3536 
3537 
/*
 * Create mmu_btopr(bytes) pages named <vp, off>, <vp, off + MMU_PAGESIZE>,
 * ... and return them linked together through p_next/p_prev.
 *
 * flags may contain only PG_EXCL, PG_WAIT and PG_PHYSCONTIG (asserted).
 * With PG_PHYSCONTIG the pages are first requested from
 * page_get_contigpage(); whatever that call leaves outstanding is then
 * collected one page at a time from page_get_anylist().
 *
 * Returns NULL when page_create_wait() refuses the request, when
 * page_get_contigpage() fails, when no suitable page is found for a
 * non-NULL mattr and PG_WAIT is not set, or when a page for one of the
 * offsets is already present in the page hash.  On the failure paths that
 * reach 'fail', pages gathered so far are disposed of and the outstanding
 * accounting is returned via page_create_putback().  Panics if no page is
 * found while mattr is NULL.
 */
page_t *
page_create_io(
        struct vnode    *vp,
        u_offset_t      off,
        uint_t          bytes,
        uint_t          flags,
        struct as       *as,
        caddr_t         vaddr,
        ddi_dma_attr_t  *mattr) /* DMA memory attributes if any */
{
        page_t          *plist = NULL;  /* pages collected so far */
        uint_t          plist_len = 0;  /* pages released on the fail path */
        pgcnt_t         npages;         /* pages still to be obtained */
        page_t          *npp = NULL;    /* candidate page not yet hashed in */
        uint_t          pages_req;      /* total pages originally requested */
        page_t          *pp;
        kmutex_t        *phm = NULL;    /* page hash mutex currently held */
        uint_t          index;

        TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
            "page_create_start:vp %p off %llx bytes %u flags %x",
            vp, off, bytes, flags);

        ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

        pages_req = npages = mmu_btopr(bytes);

        /*
         * Do the freemem and pcf accounting.
         */
        if (!page_create_wait(npages, flags)) {
                return (NULL);
        }

        TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
            "page_create_success:vp %p off %llx", vp, off);

        /*
         * If satisfying this request has left us with too little
         * memory, start the wheels turning to get some back.  The
         * first clause of the test prevents waking up the pageout
         * daemon in situations where it would decide that there's
         * nothing to do.
         */
        if (nscan < desscan && freemem < minfree) {
                TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
                    "pageout_cv_signal:freemem %ld", freemem);
                cv_signal(&proc_pageout->p_cv);
        }

        if (flags & PG_PHYSCONTIG) {

                /*
                 * npages is passed by reference; it is used below as the
                 * count of pages that still have to be obtained.
                 */
                plist = page_get_contigpage(&npages, mattr, 1);
                if (plist == NULL) {
                        page_create_putback(npages);
                        return (NULL);
                }

                pp = plist;

                /*
                 * Name each contiguous page <vp, off> and mark it as
                 * in use and referenced; the list is circular, so stop
                 * when we are back at its head.
                 */
                do {
                        if (!page_hashin(pp, vp, off, NULL)) {
                                panic("pg_creat_io: hashin failed %p %p %llx",
                                    (void *)pp, (void *)vp, off);
                        }
                        VM_STAT_ADD(page_create_new);
                        off += MMU_PAGESIZE;
                        PP_CLRFREE(pp);
                        PP_CLRAGED(pp);
                        page_set_props(pp, P_REF);
                        pp = pp->p_next;
                } while (pp != plist);

                if (!npages) {
                        check_dma(mattr, plist, pages_req);
                        return (plist);
                } else {
                        /* skip vaddr past the pages already obtained */
                        vaddr += (pages_req - npages) << MMU_PAGESHIFT;
                }

                /*
                 * fall-thru:
                 *
                 * page_get_contigpage returns when npages <= sgllen.
                 * Grab the rest of the non-contig pages below from anylist.
                 */
        }

        /*
         * Loop around collecting the requested number of pages.
         * Most of the time, we have to `create' a new page. With
         * this in mind, pull the page off the free list before
         * getting the hash lock.  This will minimize the hash
         * lock hold time, nesting, and the like.  If it turns
         * out we don't need the page, we put it back at the end.
         */
        while (npages--) {
                phm = NULL;

                index = PAGE_HASH_FUNC(vp, off);
top:
                ASSERT(phm == NULL);
                ASSERT(index == PAGE_HASH_FUNC(vp, off));
                ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

                if (npp == NULL) {
                        /*
                         * Try to get the page of any color either from
                         * the freelist or from the cache list.
                         */
                        npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
                            flags & ~PG_MATCH_COLOR, mattr, NULL);
                        if (npp == NULL) {
                                if (mattr == NULL) {
                                        /*
                                         * Not looking for a special page;
                                         * panic!
                                         */
                                        panic("no page found %d", (int)npages);
                                }
                                /*
                                 * No page found! This can happen
                                 * if we are looking for a page
                                 * within a specific memory range
                                 * for DMA purposes. If PG_WAIT is
                                 * specified then we wait for a
                                 * while and then try again. The
                                 * wait could be forever if we
                                 * don't get the page(s) we need.
                                 *
                                 * Note: XXX We really need a mechanism
                                 * to wait for pages in the desired
                                 * range. For now, we wait for any
                                 * pages and see if we can use it.
                                 */

                                if ((mattr != NULL) && (flags & PG_WAIT)) {
                                        delay(10);
                                        goto top;
                                }
                                goto fail; /* undo accounting stuff */
                        }

                        if (PP_ISAGED(npp) == 0) {
                                /*
                                 * Since this page came from the
                                 * cachelist, we must destroy the
                                 * old vnode association.
                                 */
                                page_hashout(npp, (kmutex_t *)NULL);
                        }
                }

                /*
                 * We own this page!
                 */
                ASSERT(PAGE_EXCL(npp));
                ASSERT(npp->p_vnode == NULL);
                ASSERT(!hat_page_is_mapped(npp));
                PP_CLRFREE(npp);
                PP_CLRAGED(npp);

                /*
                 * Here we have a page in our hot little mitts and are
                 * just waiting to stuff it on the appropriate lists.
                 * Get the mutex and check to see if it really does
                 * not exist.
                 */
                phm = PAGE_HASH_MUTEX(index);
                mutex_enter(phm);
                PAGE_HASH_SEARCH(index, pp, vp, off);
                if (pp == NULL) {
                        VM_STAT_ADD(page_create_new);
                        pp = npp;
                        npp = NULL;
                        if (!page_hashin(pp, vp, off, phm)) {
                                /*
                                 * Since we hold the page hash mutex and
                                 * just searched for this page, page_hashin
                                 * had better not fail.  If it does, that
                                 * means some thread did not follow the
                                 * page hash mutex rules.  Panic now and
                                 * get it over with.  As usual, go down
                                 * holding all the locks.
                                 */
                                ASSERT(MUTEX_HELD(phm));
                                panic("page_create: hashin fail %p %p %llx %p",
                                    (void *)pp, (void *)vp, off, (void *)phm);

                        }
                        ASSERT(MUTEX_HELD(phm));
                        mutex_exit(phm);
                        phm = NULL;

                        /*
                         * Hat layer locking need not be done to set
                         * the following bits since the page is not hashed
                         * and was on the free list (i.e., had no mappings).
                         *
                         * Set the reference bit to protect
                         * against immediate pageout
                         *
                         * XXXmh modify freelist code to set reference
                         * bit so we don't have to do it here.
                         */
                        page_set_props(pp, P_REF);
                } else {
                        ASSERT(MUTEX_HELD(phm));
                        mutex_exit(phm);
                        phm = NULL;
                        /*
                         * NOTE: This should not happen for pages associated
                         *       with kernel vnode 'kvp'.
                         */
                        /* XX64 - to debug why this happens! */
                        ASSERT(!VN_ISKAS(vp));
                        if (VN_ISKAS(vp))
                                cmn_err(CE_NOTE,
                                    "page_create: page not expected "
                                    "in hash list for kernel vnode - pp 0x%p",
                                    (void *)pp);
                        VM_STAT_ADD(page_create_exists);
                        /* npp is still set here; 'fail' puts it back */
                        goto fail;
                }

                /*
                 * Got a page!  It is locked.  Acquire the i/o
                 * lock since we are going to use the p_next and
                 * p_prev fields to link the requested pages together.
                 */
                page_io_lock(pp);
                page_add(&plist, pp);
                plist = plist->p_next;
                off += MMU_PAGESIZE;
                vaddr += MMU_PAGESIZE;
        }

        check_dma(mattr, plist, pages_req);
        return (plist);

fail:
        if (npp != NULL) {
                /*
                 * Did not need this page after all.
                 * Put it back on the free list.
                 */
                VM_STAT_ADD(page_create_putbacks);
                PP_SETFREE(npp);
                PP_SETAGED(npp);
                npp->p_offset = (u_offset_t)-1;
                page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
                page_unlock(npp);
        }

        /*
         * Give up the pages we already got.
         */
        while (plist != NULL) {
                pp = plist;
                page_sub(&plist, pp);
                page_io_unlock(pp);
                plist_len++;
                /*LINTED: constant in conditional ctx*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
        }

        /*
         * VN_DISPOSE does freemem accounting for the pages in plist
         * by calling page_free. So, we need to undo the pcf accounting
         * for only the remaining pages.
         */
        VM_STAT_ADD(page_create_putbacks);
        page_create_putback(pages_req - plist_len);

        return (NULL);
}
3814 #endif /* !__xpv */
3815 
3816 
3817 /*
3818  * Copy the data from the physical page represented by "frompp" to
3819  * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3820  * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
3821  * level and no one sleeps with an active mapping there.
3822  *
3823  * Note that the ref/mod bits in the page_t's are not affected by
3824  * this operation, hence it is up to the caller to update them appropriately.
3825  */
3826 int
3827 ppcopy(page_t *frompp, page_t *topp)
3828 {
3829         caddr_t         pp_addr1;
3830         caddr_t         pp_addr2;
3831         hat_mempte_t    pte1;
3832         hat_mempte_t    pte2;
3833         kmutex_t        *ppaddr_mutex;
3834         label_t         ljb;
3835         int             ret = 1;
3836 
3837         ASSERT_STACK_ALIGNED();
3838         ASSERT(PAGE_LOCKED(frompp));
3839         ASSERT(PAGE_LOCKED(topp));
3840 
3841         if (kpm_enable) {
3842                 pp_addr1 = hat_kpm_page2va(frompp, 0);
3843                 pp_addr2 = hat_kpm_page2va(topp, 0);
3844                 kpreempt_disable();
3845         } else {
3846                 /*
3847                  * disable pre-emption so that CPU can't change
3848                  */
3849                 kpreempt_disable();
3850 
3851                 pp_addr1 = CPU->cpu_caddr1;
3852                 pp_addr2 = CPU->cpu_caddr2;
3853                 pte1 = CPU->cpu_caddr1pte;
3854                 pte2 = CPU->cpu_caddr2pte;
3855 
3856                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3857                 mutex_enter(ppaddr_mutex);
3858 
3859                 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3860                     PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3861                 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3862                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3863                     HAT_LOAD_NOCONSIST);
3864         }
3865 
3866         if (on_fault(&ljb)) {
3867                 ret = 0;
3868                 goto faulted;
3869         }
3870         if (use_sse_pagecopy)
3871 #ifdef __xpv
3872                 page_copy_no_xmm(pp_addr2, pp_addr1);
3873 #else
3874                 hwblkpagecopy(pp_addr1, pp_addr2);
3875 #endif
3876         else
3877                 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3878 
3879         no_fault();
3880 faulted:
3881         if (!kpm_enable) {
3882 #ifdef __xpv
3883                 /*
3884                  * We can't leave unused mappings laying about under the
3885                  * hypervisor, so blow them away.
3886                  */
3887                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3888                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3889                         panic("HYPERVISOR_update_va_mapping() failed");
3890                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3891                     UVMF_INVLPG | UVMF_LOCAL) < 0)
3892                         panic("HYPERVISOR_update_va_mapping() failed");
3893 #endif
3894                 mutex_exit(ppaddr_mutex);
3895         }
3896         kpreempt_enable();
3897         return (ret);
3898 }
3899 
3900 void
3901 pagezero(page_t *pp, uint_t off, uint_t len)
3902 {
3903         ASSERT(PAGE_LOCKED(pp));
3904         pfnzero(page_pptonum(pp), off, len);
3905 }
3906 
3907 /*
3908  * Zero the physical page from off to off + len given by pfn
3909  * without changing the reference and modified bits of page.
3910  *
3911  * We use this using CPU private page address #2, see ppcopy() for more info.
3912  * pfnzero() must not be called at interrupt level.
3913  */
3914 void
3915 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3916 {
3917         caddr_t         pp_addr2;
3918         hat_mempte_t    pte2;
3919         kmutex_t        *ppaddr_mutex = NULL;
3920 
3921         ASSERT_STACK_ALIGNED();
3922         ASSERT(len <= MMU_PAGESIZE);
3923         ASSERT(off <= MMU_PAGESIZE);
3924         ASSERT(off + len <= MMU_PAGESIZE);
3925 
3926         if (kpm_enable && !pfn_is_foreign(pfn)) {
3927                 pp_addr2 = hat_kpm_pfn2va(pfn);
3928                 kpreempt_disable();
3929         } else {
3930                 kpreempt_disable();
3931 
3932                 pp_addr2 = CPU->cpu_caddr2;
3933                 pte2 = CPU->cpu_caddr2pte;
3934 
3935                 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3936                 mutex_enter(ppaddr_mutex);
3937 
3938                 hat_mempte_remap(pfn, pp_addr2, pte2,
3939                     PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3940                     HAT_LOAD_NOCONSIST);
3941         }
3942 
3943         if (use_sse_pagezero) {
3944 #ifdef __xpv
3945                 uint_t rem;
3946 
3947                 /*
3948                  * zero a byte at a time until properly aligned for
3949                  * block_zero_no_xmm().
3950                  */
3951                 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3952                         pp_addr2[off++] = 0;
3953 
3954                 /*
3955                  * Now use faster block_zero_no_xmm() for any range
3956                  * that is properly aligned and sized.
3957                  */
3958                 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3959                 len -= rem;
3960                 if (len != 0) {
3961                         block_zero_no_xmm(pp_addr2 + off, len);
3962                         off += len;
3963                 }
3964 
3965                 /*
3966                  * zero remainder with byte stores.
3967                  */
3968                 while (rem-- > 0)
3969                         pp_addr2[off++] = 0;
3970 #else
3971                 hwblkclr(pp_addr2 + off, len);
3972 #endif
3973         } else {
3974                 bzero(pp_addr2 + off, len);
3975         }
3976 
3977         if (!kpm_enable || pfn_is_foreign(pfn)) {
3978 #ifdef __xpv
3979                 /*
3980                  * On the hypervisor this page might get used for a page
3981                  * table before any intervening change to this mapping,
3982                  * so blow it away.
3983                  */
3984                 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3985                     UVMF_INVLPG) < 0)
3986                         panic("HYPERVISOR_update_va_mapping() failed");
3987 #endif
3988                 mutex_exit(ppaddr_mutex);
3989         }
3990 
3991         kpreempt_enable();
3992 }
3993 
/*
 * Platform-dependent page scrub call.
 *
 * Scrubs 'len' bytes starting at byte offset 'off' of page 'pp'.  On this
 * platform the request is simply handed to pagezero(), so the caller must
 * meet pagezero()'s requirements (the page must be locked).
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
        /*
         * For now, we rely on the fact that pagezero() will
         * always clear UEs.
         */
        pagezero(pp, off, len);
}
4006 
4007 /*
4008  * set up two private addresses for use on a given CPU for use in ppcopy()
4009  */
4010 void
4011 setup_vaddr_for_ppcopy(struct cpu *cpup)
4012 {
4013         void *addr;
4014         hat_mempte_t pte_pa;
4015 
4016         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4017         pte_pa = hat_mempte_setup(addr);
4018         cpup->cpu_caddr1 = addr;
4019         cpup->cpu_caddr1pte = pte_pa;
4020 
4021         addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
4022         pte_pa = hat_mempte_setup(addr);
4023         cpup->cpu_caddr2 = addr;
4024         cpup->cpu_caddr2pte = pte_pa;
4025 
4026         mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
4027 }
4028 
4029 /*
4030  * Undo setup_vaddr_for_ppcopy
4031  */
4032 void
4033 teardown_vaddr_for_ppcopy(struct cpu *cpup)
4034 {
4035         mutex_destroy(&cpup->cpu_ppaddr_mutex);
4036 
4037         hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
4038         cpup->cpu_caddr2pte = 0;
4039         vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
4040         cpup->cpu_caddr2 = 0;
4041 
4042         hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
4043         cpup->cpu_caddr1pte = 0;
4044         vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
4045         cpup->cpu_caddr1 = 0;
4046 }
4047 
/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms, so
 * this is deliberately a no-op.
 *
 * Takes no arguments; the definition is a proper prototype so that a
 * call passing arguments is diagnosed by the compiler.
 */
void
dcache_flushall(void)
{
}
4055 
4056 /*
4057  * Allocate a memory page.  The argument 'seed' can be any pseudo-random
4058  * number to vary where the pages come from.  This is quite a hacked up
4059  * method -- it works for now, but really needs to be fixed up a bit.
4060  *
4061  * We currently use page_create_va() on the kvp with fake offsets,
4062  * segments and virt address.  This is pretty bogus, but was copied from the
4063  * old hat_i86.c code.  A better approach would be to specify either mnode
4064  * random or mnode local and takes a page from whatever color has the MOST
4065  * available - this would have a minimal impact on page coloring.
4066  */
4067 page_t *
4068 page_get_physical(uintptr_t seed)
4069 {
4070         page_t *pp;
4071         u_offset_t offset;
4072         static struct seg tmpseg;
4073         static uintptr_t ctr = 0;
4074 
4075         /*
4076          * This code is gross, we really need a simpler page allocator.
4077          *
4078          * We need to assign an offset for the page to call page_create_va()
4079          * To avoid conflicts with other pages, we get creative with the offset.
4080          * For 32 bits, we need an offset > 4Gig
4081          * For 64 bits, need an offset somewhere in the VA hole.
4082          */
4083         offset = seed;
4084         if (offset > kernelbase)
4085                 offset -= kernelbase;
4086         offset <<= MMU_PAGESHIFT;
4087 #if defined(__amd64)
4088         offset += mmu.hole_start;       /* something in VA hole */
4089 #else
4090         offset += 1ULL << 40;     /* something > 4 Gig */
4091 #endif
4092 
4093         if (page_resv(1, KM_NOSLEEP) == 0)
4094                 return (NULL);
4095 
4096 #ifdef  DEBUG
4097         pp = page_exists(&kvp, offset);
4098         if (pp != NULL)
4099                 panic("page already exists %p", (void *)pp);
4100 #endif
4101 
4102         pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4103             &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));       /* changing VA usage */
4104         if (pp != NULL) {
4105                 page_io_unlock(pp);
4106                 page_downgrade(pp);
4107         }
4108         return (pp);
4109 }