1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /*
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
27 */
28
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
31
32 /*
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
35 */
36
37 /*
38 * UNIX machine dependent virtual memory support.
39 */
40
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/swap.h>
61 #include <sys/dumphdr.h>
62 #include <sys/random.h>
63
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_kp.h>
68 #include <vm/seg_vn.h>
69 #include <vm/page.h>
70 #include <vm/seg_kmem.h>
71 #include <vm/seg_kpm.h>
72 #include <vm/vm_dep.h>
73
74 #include <sys/cpu.h>
75 #include <sys/vm_machparam.h>
76 #include <sys/memlist.h>
77 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
78 #include <vm/hat_i86.h>
79 #include <sys/x86_archext.h>
80 #include <sys/elf_386.h>
81 #include <sys/cmn_err.h>
82 #include <sys/archsystm.h>
83 #include <sys/machsystm.h>
84
85 #include <sys/vtrace.h>
86 #include <sys/ddidmareq.h>
87 #include <sys/promif.h>
88 #include <sys/memnode.h>
89 #include <sys/stack.h>
90 #include <util/qsort.h>
91 #include <sys/taskq.h>
92
93 #ifdef __xpv
94
95 #include <sys/hypervisor.h>
96 #include <sys/xen_mmu.h>
97 #include <sys/balloon_impl.h>
98
99 /*
100 * Domain 0 pages usable for DMA are pre-allocated and kept in
101 * distinct lists, ordered by increasing mfn.
102 */
103 static kmutex_t io_pool_lock;
104 static kmutex_t contig_list_lock;
105 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */
106 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */
107 static long io_pool_cnt;
108 static long io_pool_cnt_max = 0;
109 #define DEFAULT_IO_POOL_MIN 128
110 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
111 static long io_pool_cnt_lowater = 0;
112 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
113 static long io_pool_shrinks; /* how many times did we really shrink */
114 static long io_pool_grows; /* how many times did we grow */
115 static mfn_t start_mfn = 1;
116 static caddr_t io_pool_kva; /* used to alloc pages when needed */
117
118 static int create_contig_pfnlist(uint_t);
119
120 /*
121 * percentage of phys mem to hold in the i/o pool
122 */
123 #define DEFAULT_IO_POOL_PCT 2
124 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
125 static void page_io_pool_sub(page_t **, page_t *, page_t *);
126 int ioalloc_dbg = 0;
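/*
 * Rough illustration of the tunables above (the exact pool sizing is
 * done in the io pool setup code, not shown here): with the default
 * io_pool_physmem_pct of 2, a domain 0 owning 4 GB (0x100000 4K pages)
 * would aim to cache on the order of
 *	0x100000 * 2 / 100 ~= 20971 pages (~82 MB)
 * for DMA-constrained allocations, and page_io_pool_shrink() will never
 * take the pool below io_pool_cnt_min (128 pages by default).
 */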
127
128 #endif /* __xpv */
129
130 uint_t vac_colors = 1;
131
132 int largepagesupport = 0;
133 extern uint_t page_create_new;
134 extern uint_t page_create_exists;
135 extern uint_t page_create_putbacks;
136 /*
137 * Allow users to disable the kernel's use of SSE.
138 */
139 extern int use_sse_pagecopy, use_sse_pagezero;
140
141 /*
142 * Combined memory ranges from mnode and memranges[] to manage a single
143 * mnode/mtype dimension in the page lists.
144 */
145 typedef struct {
146 pfn_t mnr_pfnlo;
147 pfn_t mnr_pfnhi;
148 int mnr_mnode;
149 int mnr_memrange; /* index into memranges[] */
150 int mnr_next; /* next lower PA mnoderange */
151 int mnr_exists;
152 /* maintain page list stats */
153 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */
154 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
155 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */
156 #ifdef DEBUG
157 struct mnr_mts { /* mnode/mtype szc stats */
158 pgcnt_t mnr_mts_pgcnt;
159 int mnr_mts_colors;
160 pgcnt_t *mnr_mtsc_pgcnt;
161 } *mnr_mts;
162 #endif
163 } mnoderange_t;
164
165 #define MEMRANGEHI(mtype) \
166 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
167 #define MEMRANGELO(mtype) (memranges[mtype])
168
169 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
170
171 /*
172 * As the PC architecture evolved, memory was clumped into several
173 * ranges for various historical I/O devices to do DMA.
174 * < 16Meg - ISA bus
175 * < 2Gig - ???
176 * < 4Gig - PCI bus or drivers that don't understand PAE mode
177 *
178 * These are listed in reverse order, so that we can skip over unused
179 * ranges on machines with small memories.
180 *
181 * For now under the Hypervisor, we'll only ever have one memrange.
182 */
183 #define PFN_4GIG 0x100000
184 #define PFN_16MEG 0x1000
185 /* Indices into the memory range (arch_memranges) array. */
186 #define MRI_4G 0
187 #define MRI_2G 1
188 #define MRI_16M 2
189 #define MRI_0 3
190 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
191 PFN_4GIG, /* pfn range for 4G and above */
192 0x80000, /* pfn range for 2G-4G */
193 PFN_16MEG, /* pfn range for 16M-2G */
194 0x00000, /* pfn range for 0-16M */
195 };
196 pfn_t *memranges = &arch_memranges[0];
197 int nranges = NUM_MEM_RANGES;
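/*
 * Worked example of the MEMRANGELO()/MEMRANGEHI() macros with the
 * default arch_memranges[] above (values are pfns of 4K pages):
 *
 *	MRI_0   (3):	[0x00000, 0x00fff]	0   - 16M
 *	MRI_16M (2):	[0x01000, 0x7ffff]	16M - 2G
 *	MRI_2G  (1):	[0x80000, 0xfffff]	2G  - 4G
 *	MRI_4G  (0):	[0x100000, physmax]	4G and up
 */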
198
199 /*
200 * This combines mem_node_config and memranges into one data
201 * structure to be used for page list management.
202 */
203 mnoderange_t *mnoderanges;
204 int mnoderangecnt;
205 int mtype4g;
206 int mtype16m;
207 int mtypetop; /* index of highest pfn'ed mnoderange */
208
209 /*
210 * 4g memory management variables for systems with more than 4g of memory:
211 *
212 * physical memory below 4g is required for 32bit dma devices and, currently,
213 * for kmem memory. On systems with more than 4g of memory, the pool of memory
214 * below 4g can be depleted without any paging activity given that there is
215 * likely to be sufficient memory above 4g.
216 *
217 * physmax4g is set true if the largest pfn is over 4g. The rest of the
218 * 4g memory management code is enabled only when physmax4g is true.
219 *
220 * maxmem4g is the count of the maximum number of pages on the page lists
221 * with physical addresses below 4g. It can be a lot less than 4g given that
222 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
223 * agp aperture etc.
224 *
225 * freemem4g maintains the count of the number of available pages on the
226 * page lists with physical addresses below 4g.
227 *
228 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
229 * 6% (desfree4gshift = 4) of maxmem4g.
230 *
231 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
232 * and the amount of physical memory above 4g is greater than freemem4g.
233 * In this case, page_get_* routines will restrict below 4g allocations
234 * for requests that don't specifically require it.
235 */
236
237 #define DESFREE4G (maxmem4g >> desfree4gshift)
238
239 #define RESTRICT4G_ALLOC \
240 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
241
242 static pgcnt_t maxmem4g;
243 static pgcnt_t freemem4g;
244 static int physmax4g;
245 static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */
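/*
 * Worked example of the macros above: with the default desfree4gshift
 * of 4 and maxmem4g at its largest possible value of 0x100000 pages
 * (a full 4 GB of usable below-4g memory), DESFREE4G is
 * 0x100000 >> 4 == 65536 pages (256 MB).  On a physmax4g system,
 * RESTRICT4G_ALLOC then fires once freemem4g drops under 65536 pages
 * while more free memory remains above 4g than below it
 * (freemem4g << 1 < freemem).
 */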
246
247 /*
248 * 16m memory management:
249 *
250 * reserve some amount of physical memory below 16m for legacy devices.
251 *
252 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
253 * 16m or if the 16m pool would drop below DESFREE16M.
254 *
255 * In this case, general page allocations via page_get_{free,cache}list
256 * routines will be restricted from allocating from the 16m pool. Allocations
257 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
258 * are not restricted.
259 */
260
261 #define FREEMEM16M MTYPE_FREEMEM(mtype16m)
262 #define DESFREE16M desfree16m
263 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
264 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \
265 ((freemem >= (FREEMEM16M)) || \
266 (FREEMEM16M < (DESFREE16M + pgcnt))))
267
268 static pgcnt_t desfree16m = 0x380;
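/*
 * Worked example: the default desfree16m of 0x380 (896) pages reserves
 * 3.5 MB of the 16 MB ISA DMA range.  For a non-PG_PANIC request of
 * pgcnt pages, RESTRICT16M_ALLOC is true either when enough free pages
 * exist above 16m (freemem >= FREEMEM16M) or when taking pgcnt pages
 * would drop the 16m pool below desfree16m.
 */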
269
270 /*
271 * This can be patched via /etc/system to allow old non-PAE aware device
272 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
273 */
274 int restricted_kmemalloc = 0;
275
276 #ifdef VM_STATS
277 struct {
278 ulong_t pga_alloc;
279 ulong_t pga_notfullrange;
280 ulong_t pga_nulldmaattr;
281 ulong_t pga_allocok;
282 ulong_t pga_allocfailed;
283 ulong_t pgma_alloc;
284 ulong_t pgma_allocok;
285 ulong_t pgma_allocfailed;
286 ulong_t pgma_allocempty;
287 } pga_vmstats;
288 #endif
289
290 uint_t mmu_page_sizes;
291
292 /* How many page sizes the users can see */
293 uint_t mmu_exported_page_sizes;
294
295 /* page sizes that legacy applications can see */
296 uint_t mmu_legacy_page_sizes;
297
298 /*
299 * Number of pages in 1 GB. Don't enable automatic large pages if we have
300 * fewer than this many pages.
301 */
302 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
303 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
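/*
 * With the x86 MMU_PAGESHIFT of 12 this is 1 << 18 == 262144 base
 * pages, i.e. automatic large pages are considered only on systems
 * with at least 1 GB of physical memory.
 */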
304
305 /*
306 * Maximum and default segment size tunables for user private
307 * and shared anon memory, and user text and initialized data.
308 * These can be patched via /etc/system to allow large pages
309 * to be used for mapping application private and shared anon memory.
310 */
311 size_t mcntl0_lpsize = MMU_PAGESIZE;
312 size_t max_uheap_lpsize = MMU_PAGESIZE;
313 size_t default_uheap_lpsize = MMU_PAGESIZE;
314 size_t max_ustack_lpsize = MMU_PAGESIZE;
315 size_t default_ustack_lpsize = MMU_PAGESIZE;
316 size_t max_privmap_lpsize = MMU_PAGESIZE;
317 size_t max_uidata_lpsize = MMU_PAGESIZE;
318 size_t max_utext_lpsize = MMU_PAGESIZE;
319 size_t max_shm_lpsize = MMU_PAGESIZE;
320
321
322 /*
323 * initialized by page_coloring_init().
324 */
325 uint_t page_colors;
326 uint_t page_colors_mask;
327 uint_t page_coloring_shift;
328 int cpu_page_colors;
329 static uint_t l2_colors;
330
331 /*
332 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
333 * and page_colors are calculated from the l2 cache n-way set size. Within a
334 * mnode range, the page freelist and cachelist are hashed into bins based on
335 * color. This makes it easier to search for a page within a specific memory
336 * range.
337 */
338 #define PAGE_COLORS_MIN 16
339
340 page_t ****page_freelists;
341 page_t ***page_cachelists;
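/*
 * These are carved out and wired up by page_coloring_setup() below and
 * are indexed as
 *	page_freelists[mtype][szc][color]
 *	page_cachelists[mtype][color]
 * where mtype indexes mnoderanges[], szc is a page size code and color
 * selects a hashed bin within the list.
 */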
342
343
344 /*
345 * Used by page layer to know about page sizes
346 */
347 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
348
349 kmutex_t *fpc_mutex[NPC_MUTEX];
350 kmutex_t *cpc_mutex[NPC_MUTEX];
351
352 /* Lock to protect mnoderanges array for memory DR operations. */
353 static kmutex_t mnoderange_lock;
354
355 /*
356 * Only let one thread at a time try to coalesce large pages, to
357 * prevent them from working against each other.
358 */
359 static kmutex_t contig_lock;
360 #define CONTIG_LOCK() mutex_enter(&contig_lock);
361 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
362
363 #define PFN_16M (mmu_btop((uint64_t)0x1000000))
364
365 /*
366 * Return the optimum page size for a given mapping
367 */
368 /*ARGSUSED*/
369 size_t
370 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
371 {
372 level_t l = 0;
373 size_t pgsz = MMU_PAGESIZE;
374 size_t max_lpsize;
375 uint_t mszc;
376
377 ASSERT(maptype != MAPPGSZ_VA);
378
379 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
380 return (MMU_PAGESIZE);
381 }
382
383 switch (maptype) {
384 case MAPPGSZ_HEAP:
385 case MAPPGSZ_STK:
386 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
387 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
388 if (max_lpsize == MMU_PAGESIZE) {
389 return (MMU_PAGESIZE);
390 }
391 if (len == 0) {
392 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
393 p->p_brksize - p->p_bssbase : p->p_stksize;
394 }
395 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
396 default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
397
398 /*
399 * use the page size that best fits len
400 */
401 for (l = mmu.umax_page_level; l > 0; --l) {
402 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
403 continue;
404 } else {
405 pgsz = LEVEL_SIZE(l);
406 }
407 break;
408 }
409
410 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
411 p->p_stkpageszc);
412 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
413 pgsz = hw_page_array[mszc].hp_size;
414 }
415 return (pgsz);
416
417 case MAPPGSZ_ISM:
418 for (l = mmu.umax_page_level; l > 0; --l) {
419 if (len >= LEVEL_SIZE(l))
420 return (LEVEL_SIZE(l));
421 }
422 return (LEVEL_SIZE(0));
423 }
424 return (pgsz);
425 }
426
427 static uint_t
428 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
429 size_t min_physmem)
430 {
431 caddr_t eaddr = addr + size;
432 uint_t szcvec = 0;
433 caddr_t raddr;
434 caddr_t readdr;
435 size_t pgsz;
436 int i;
437
438 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
439 return (0);
440 }
441
442 for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
443 pgsz = page_get_pagesize(i);
444 if (pgsz > max_lpsize) {
445 continue;
446 }
447 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
448 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
449 if (raddr < addr || raddr >= readdr) {
450 continue;
451 }
452 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
453 continue;
454 }
455 /*
456 * Set szcvec to the remaining page sizes.
457 */
458 szcvec = ((1 << (i + 1)) - 1) & ~1;
459 break;
460 }
461 return (szcvec);
462 }
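/*
 * Example of the szcvec computation above: if the largest page size code
 * whose size fits and aligns within [addr, eaddr) is 2, the function
 * returns ((1 << 3) - 1) & ~1 == 0x6, i.e. size codes 1 and 2 remain
 * candidates and the base page size bit is always cleared.
 */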
463
464 /*
465 * Return a bit vector of large page size codes that
466 * can be used to map [addr, addr + len) region.
467 */
468 /*ARGSUSED*/
469 uint_t
470 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
471 int memcntl)
472 {
473 size_t max_lpsize = mcntl0_lpsize;
474
475 if (mmu.max_page_level == 0)
476 return (0);
477
478 if (flags & MAP_TEXT) {
479 if (!memcntl)
480 max_lpsize = max_utext_lpsize;
481 return (map_szcvec(addr, size, off, max_lpsize,
482 shm_lpg_min_physmem));
483
484 } else if (flags & MAP_INITDATA) {
485 if (!memcntl)
486 max_lpsize = max_uidata_lpsize;
487 return (map_szcvec(addr, size, off, max_lpsize,
488 privm_lpg_min_physmem));
489
490 } else if (type == MAPPGSZC_SHM) {
491 if (!memcntl)
492 max_lpsize = max_shm_lpsize;
493 return (map_szcvec(addr, size, off, max_lpsize,
494 shm_lpg_min_physmem));
495
496 } else if (type == MAPPGSZC_HEAP) {
497 if (!memcntl)
498 max_lpsize = max_uheap_lpsize;
499 return (map_szcvec(addr, size, off, max_lpsize,
500 privm_lpg_min_physmem));
501
502 } else if (type == MAPPGSZC_STACK) {
503 if (!memcntl)
504 max_lpsize = max_ustack_lpsize;
505 return (map_szcvec(addr, size, off, max_lpsize,
506 privm_lpg_min_physmem));
507
508 } else {
509 if (!memcntl)
510 max_lpsize = max_privmap_lpsize;
511 return (map_szcvec(addr, size, off, max_lpsize,
512 privm_lpg_min_physmem));
513 }
514 }
515
516 /*
517 * Handle a pagefault.
518 */
519 faultcode_t
520 pagefault(
521 caddr_t addr,
522 enum fault_type type,
523 enum seg_rw rw,
524 int iskernel)
525 {
526 struct as *as;
527 struct hat *hat;
528 struct proc *p;
529 kthread_t *t;
530 faultcode_t res;
531 caddr_t base;
532 size_t len;
533 int err;
534 int mapped_red;
535 uintptr_t ea;
536
537 ASSERT_STACK_ALIGNED();
538
539 if (INVALID_VADDR(addr))
540 return (FC_NOMAP);
541
542 mapped_red = segkp_map_red();
543
544 if (iskernel) {
545 as = &kas;
546 hat = as->a_hat;
547 } else {
548 t = curthread;
549 p = ttoproc(t);
550 as = p->p_as;
551 hat = as->a_hat;
552 }
553
554 /*
555 * Dispatch pagefault.
556 */
557 res = as_fault(hat, as, addr, 1, type, rw);
558
559 /*
560 * If this isn't a potential unmapped hole in the user's
561 * UNIX data or stack segments, just return status info.
562 */
563 if (res != FC_NOMAP || iskernel)
564 goto out;
565
566 /*
567 * Check to see if we happened to fault on a currently unmapped
568 * part of the UNIX data or stack segments. If so, create a zfod
569 * mapping there and then try calling the fault routine again.
570 */
571 base = p->p_brkbase;
572 len = p->p_brksize;
573
574 if (addr < base || addr >= base + len) { /* data seg? */
575 base = (caddr_t)p->p_usrstack - p->p_stksize;
576 len = p->p_stksize;
577 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
578 /* not in either UNIX data or stack segments */
579 res = FC_NOMAP;
580 goto out;
581 }
582 }
583
584 /*
585 * The rest of this function implements 3.X/4.X/5.X compatibility.
586 * This code is probably not needed anymore.
587 */
588 if (p->p_model == DATAMODEL_ILP32) {
589
590 /* expand the gap to the page boundaries on each side */
591 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
592 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
593 len = ea - (uintptr_t)base;
594
595 as_rangelock(as);
596 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
597 0) {
598 err = as_map(as, base, len, segvn_create, zfod_argsp);
599 as_rangeunlock(as);
600 if (err) {
601 res = FC_MAKE_ERR(err);
602 goto out;
603 }
604 } else {
605 /*
606 * This page is already mapped by another thread after
607 * we returned from as_fault() above. We just fall
608 * through to the as_fault() call below.
609 */
610 as_rangeunlock(as);
611 }
612
613 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
614 }
615
616 out:
617 if (mapped_red)
618 segkp_unmap_red();
619
620 return (res);
621 }
622
623 void
624 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
625 {
626 struct proc *p = curproc;
627 caddr_t userlimit = (flags & _MAP_LOW32) ?
628 (caddr_t)_userlimit32 : p->p_as->a_userlimit;
629
630 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
631 }
632
633 /*ARGSUSED*/
634 int
635 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
636 {
637 return (0);
638 }
639
640 /*
641 * The maximum amount a randomized mapping will be slewed. We should perhaps
642 * arrange things so these tunables can be separate for mmap, mmapobj, and
643 * ld.so.
644 */
645 volatile size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
646
647 /*
648 * map_addr_proc() is the routine called when the system is to
649 * choose an address for the user. We will pick an address
650 * range which is the highest available below userlimit.
651 *
652 * Every mapping will have a redzone of a single page on either side of
653 * the request. This is done to leave one page unmapped between segments.
654 * This is not required, but it's useful for the user because if their
655 * program strays across a segment boundary, it will catch a fault
656 * immediately making debugging a little easier. Currently the redzone
657 * is mandatory.
658 *
659 * addrp is a value/result parameter.
660 * On input it is a hint from the user to be used in a completely
661 * machine dependent fashion. We decide to completely ignore this hint.
662 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
663 * must be some "power of two" multiple of pagesize.
664 *
665 * On output it is NULL if no address can be found in the current
666 * process's address space or else an address that is currently
667 * not mapped for len bytes with a page of red zone on either side.
668 *
669 * vacalign is not needed on x86 (it's for virtually addressed caches)
670 */
671 /*ARGSUSED*/
672 void
673 map_addr_proc(
674 caddr_t *addrp,
675 size_t len,
676 offset_t off,
677 int vacalign,
678 caddr_t userlimit,
679 struct proc *p,
680 uint_t flags)
681 {
682 struct as *as = p->p_as;
683 caddr_t addr;
684 caddr_t base;
685 size_t slen;
686 size_t align_amount;
687
688 ASSERT32(userlimit == as->a_userlimit);
689
690 base = p->p_brkbase;
691 #if defined(__amd64)
692 /*
693 * XX64 Yes, this needs more work.
694 */
695 if (p->p_model == DATAMODEL_NATIVE) {
696 if (userlimit < as->a_userlimit) {
697 /*
698 * This happens when a program wants to map
699 * something in a range that's accessible to a
700 * program in a smaller address space. For example,
701 * a 64-bit program calling mmap32(2) to guarantee
702 * that the returned address is below 4Gbytes.
703 */
704 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
705
706 if (userlimit > base)
707 slen = userlimit - base;
708 else {
709 *addrp = NULL;
710 return;
711 }
712 } else {
713 /*
714 * XX64 This layout is probably wrong .. but in
715 * the event we make the amd64 address space look
716 * like sparcv9 i.e. with the stack -above- the
717 * heap, this bit of code might even be correct.
718 */
719 slen = p->p_usrstack - base -
720 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
721 }
722 } else
723 #endif
724 slen = userlimit - base;
725
726 /* Make len be a multiple of PAGESIZE */
727 len = (len + PAGEOFFSET) & PAGEMASK;
728
729 /*
730 * figure out what the alignment should be
731 *
732 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
733 */
734 if (len <= ELF_386_MAXPGSZ) {
735 /*
736 * Align virtual addresses to ensure that ELF shared libraries
737 * are mapped with the appropriate alignment constraints by
738 * the run-time linker.
739 */
740 align_amount = ELF_386_MAXPGSZ;
741 } else {
742 /*
743 * For 32-bit processes, only those which have specified
744 * MAP_ALIGN and an addr will be aligned on a larger page size.
745 * Not doing so can potentially waste up to 1G of process
746 * address space.
747 */
748 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
749 mmu.umax_page_level;
750
751 while (lvl && len < LEVEL_SIZE(lvl))
752 --lvl;
753
754 align_amount = LEVEL_SIZE(lvl);
755 }
756 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
757 align_amount = (uintptr_t)*addrp;
758
759 ASSERT(ISP2(align_amount));
760 ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
761
762 off = off & (align_amount - 1);
763
764 /*
765 * Look for a large enough hole starting below userlimit.
766 * After finding it, use the upper part.
767 */
768 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
769 PAGESIZE, off) == 0) {
770 caddr_t as_addr;
771
772 /*
773 * addr is the highest possible address to use since we have
774 * a PAGESIZE redzone at the beginning and end.
775 */
776 addr = base + slen - (PAGESIZE + len);
777 as_addr = addr;
778 /*
779 * Round address DOWN to the alignment amount and
780 * add the offset in.
781 * If addr is greater than as_addr, len would not be large
782 * enough to include the redzone, so we must adjust down
783 * by the alignment amount.
784 */
785 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
786 addr += (uintptr_t)off;
787 if (addr > as_addr) {
788 addr -= align_amount;
789 }
790
791 /*
792 * If randomization is requested, slew the allocation
793 * backwards, within the same gap, by a random amount.
794 *
795 * XXX: This will fall over in processes like Java, which
796 * commonly have a great many small mappings.
797 */
798 if (flags & _MAP_RANDOMIZE) {
799 uint32_t slew;
800
801 (void) random_get_pseudo_bytes((uint8_t *)&slew,
802 sizeof (slew));
803
804 slew = slew % MIN(aslr_max_map_skew, (addr - base));
805 addr -= P2ALIGN(slew, align_amount);
806 }
807
808 ASSERT(addr > base);
809 ASSERT(addr + len < base + slen);
810 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
811 ((uintptr_t)(off)));
812 *addrp = addr;
813 } else {
814 *addrp = NULL; /* no more virtual space */
815 }
816 }
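/*
 * Worked example of the placement math above (hypothetical values,
 * no randomization): for a gap with base == 0x10000000 and
 * slen == 4 MB, a request of len == 128 K with align_amount == 64 K
 * and off == 0 yields
 *	addr = 0x10400000 - (PAGESIZE + 0x20000) = 0x103df000
 * which is rounded down to 0x103d0000, leaving the mapping aligned
 * with at least a one-page redzone above addr + len.
 */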
817
818 int valid_va_range_aligned_wraparound;
819
820 /*
821 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
822 * addresses at least "minlen" long, where the base of the range is at "off"
823 * phase from an "align" boundary and there is space for a "redzone"-sized
824 * redzone on either side of the range. On success, 1 is returned and *basep
825 * and *lenp are adjusted to describe the acceptable range (including
826 * the redzone). On failure, 0 is returned.
827 */
828 /*ARGSUSED3*/
829 int
830 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
831 size_t align, size_t redzone, size_t off)
832 {
833 uintptr_t hi, lo;
834 size_t tot_len;
835
836 ASSERT(align == 0 ? off == 0 : off < align);
837 ASSERT(ISP2(align));
838 ASSERT(align == 0 || align >= PAGESIZE);
839
840 lo = (uintptr_t)*basep;
841 hi = lo + *lenp;
842 tot_len = minlen + 2 * redzone; /* need at least this much space */
843
844 /*
845 * If hi rolled over the top, try cutting back.
846 */
847 if (hi < lo) {
848 *lenp = 0UL - lo - 1UL;
849 /* See if this really happens. If so, then we figure out why */
850 valid_va_range_aligned_wraparound++;
851 hi = lo + *lenp;
852 }
853 if (*lenp < tot_len) {
854 return (0);
855 }
856
857 #if defined(__amd64)
858 /*
859 * Deal with a possible hole in the address range between
860 * hole_start and hole_end that should never be mapped.
861 */
862 if (lo < hole_start) {
863 if (hi > hole_start) {
864 if (hi < hole_end) {
865 hi = hole_start;
866 } else {
867 /* lo < hole_start && hi >= hole_end */
868 if (dir == AH_LO) {
869 /*
870 * prefer lowest range
871 */
872 if (hole_start - lo >= tot_len)
873 hi = hole_start;
874 else if (hi - hole_end >= tot_len)
875 lo = hole_end;
876 else
877 return (0);
878 } else {
879 /*
880 * prefer highest range
881 */
882 if (hi - hole_end >= tot_len)
883 lo = hole_end;
884 else if (hole_start - lo >= tot_len)
885 hi = hole_start;
886 else
887 return (0);
888 }
889 }
890 }
891 } else {
892 /* lo >= hole_start */
893 if (hi < hole_end)
894 return (0);
895 if (lo < hole_end)
896 lo = hole_end;
897 }
898 #endif
899
900 if (hi - lo < tot_len)
901 return (0);
902
903 if (align > 1) {
904 uintptr_t tlo = lo + redzone;
905 uintptr_t thi = hi - redzone;
906 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
907 if (tlo < lo + redzone) {
908 return (0);
909 }
910 if (thi < tlo || thi - tlo < minlen) {
911 return (0);
912 }
913 }
914
915 *basep = (caddr_t)lo;
916 *lenp = hi - lo;
917 return (1);
918 }
919
920 /*
921 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
922 * addresses at least "minlen" long. On success, 1 is returned and *basep
923 * and *lenp are adjusted to describe the acceptable range. On failure, 0
924 * is returned.
925 */
926 int
927 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
928 {
929 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
930 }
931
932 /*
933 * Determine whether [addr, addr+len] are valid user addresses.
934 */
935 /*ARGSUSED*/
936 int
937 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
938 caddr_t userlimit)
939 {
940 caddr_t eaddr = addr + len;
941
942 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
943 return (RANGE_BADADDR);
944
945 #if defined(__amd64)
946 /*
947 * Check for the VA hole
948 */
949 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
950 return (RANGE_BADADDR);
951 #endif
952
953 return (RANGE_OKAY);
954 }
955
956 /*
957 * Return 1 if the page frame is onboard memory, else 0.
958 */
959 int
960 pf_is_memory(pfn_t pf)
961 {
962 if (pfn_is_foreign(pf))
963 return (0);
964 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
965 }
966
967 /*
968 * return the memrange containing pfn
969 */
970 int
971 memrange_num(pfn_t pfn)
972 {
973 int n;
974
975 for (n = 0; n < nranges - 1; ++n) {
976 if (pfn >= memranges[n])
977 break;
978 }
979 return (n);
980 }
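/*
 * Example: with the default memranges[], a pfn of 0x90000 (2.25 GB) is
 * below memranges[MRI_4G] (0x100000) but not below memranges[MRI_2G]
 * (0x80000), so memrange_num() returns MRI_2G.
 */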
981
982 /*
983 * return the mnoderange containing pfn
984 */
985 /*ARGSUSED*/
986 int
987 pfn_2_mtype(pfn_t pfn)
988 {
989 #if defined(__xpv)
990 return (0);
991 #else
992 int n;
993
994 /* Always start from highest pfn and work our way down */
995 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
996 if (pfn >= mnoderanges[n].mnr_pfnlo) {
997 break;
998 }
999 }
1000 return (n);
1001 #endif
1002 }
1003
1004 #if !defined(__xpv)
1005 /*
1006 * is_contigpage_free:
1007 * returns a page list of contiguous pages. It minimally has to return
1008 * minctg pages. Caller determines minctg based on the scatter-gather
1009 * list length.
1010 *
1011 * pfnp is set to the next page frame to search on return.
1012 */
1013 static page_t *
1014 is_contigpage_free(
1015 pfn_t *pfnp,
1016 pgcnt_t *pgcnt,
1017 pgcnt_t minctg,
1018 uint64_t pfnseg,
1019 int iolock)
1020 {
1021 int i = 0;
1022 pfn_t pfn = *pfnp;
1023 page_t *pp;
1024 page_t *plist = NULL;
1025
1026 /*
1027 * fail if pfn + minctg crosses a segment boundary.
1028 * Adjust for next starting pfn to begin at segment boundary.
1029 */
1030
1031 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1032 *pfnp = roundup(*pfnp, pfnseg + 1);
1033 return (NULL);
1034 }
1035
1036 do {
1037 retry:
1038 pp = page_numtopp_nolock(pfn + i);
1039 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1040 (page_trylock(pp, SE_EXCL) == 0)) {
1041 (*pfnp)++;
1042 break;
1043 }
1044 if (page_pptonum(pp) != pfn + i) {
1045 page_unlock(pp);
1046 goto retry;
1047 }
1048
1049 if (!(PP_ISFREE(pp))) {
1050 page_unlock(pp);
1051 (*pfnp)++;
1052 break;
1053 }
1054
1055 if (!PP_ISAGED(pp)) {
1056 page_list_sub(pp, PG_CACHE_LIST);
1057 page_hashout(pp, (kmutex_t *)NULL);
1058 } else {
1059 page_list_sub(pp, PG_FREE_LIST);
1060 }
1061
1062 if (iolock)
1063 page_io_lock(pp);
1064 page_list_concat(&plist, &pp);
1065
1066 /*
1067 * exit loop when pgcnt satisfied or segment boundary reached.
1068 */
1069
1070 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1071
1072 *pfnp += i; /* set to next pfn to search */
1073
1074 if (i >= minctg) {
1075 *pgcnt -= i;
1076 return (plist);
1077 }
1078
1079 /*
1080 * failure: minctg not satisfied.
1081 *
1082 * if next request crosses segment boundary, set next pfn
1083 * to search from the segment boundary.
1084 */
1085 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1086 *pfnp = roundup(*pfnp, pfnseg + 1);
1087
1088 /* clean up any pages already allocated */
1089
1090 while (plist) {
1091 pp = plist;
1092 page_sub(&plist, pp);
1093 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1094 if (iolock)
1095 page_io_unlock(pp);
1096 page_unlock(pp);
1097 }
1098
1099 return (NULL);
1100 }
1101 #endif /* !__xpv */
1102
1103 /*
1104 * verify that pages being returned from allocator have correct DMA attribute
1105 */
1106 #ifndef DEBUG
1107 #define check_dma(a, b, c) (void)(0)
1108 #else
1109 static void
1110 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1111 {
1112 if (dma_attr == NULL)
1113 return;
1114
1115 while (cnt-- > 0) {
1116 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1117 dma_attr->dma_attr_addr_lo)
1118 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1119 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1120 dma_attr->dma_attr_addr_hi)
1121 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1122 pp = pp->p_next;
1123 }
1124 }
1125 #endif
1126
1127 #if !defined(__xpv)
1128 static page_t *
1129 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1130 {
1131 pfn_t pfn;
1132 int sgllen;
1133 uint64_t pfnseg;
1134 pgcnt_t minctg;
1135 page_t *pplist = NULL, *plist;
1136 uint64_t lo, hi;
1137 pgcnt_t pfnalign = 0;
1138 static pfn_t startpfn;
1139 static pgcnt_t lastctgcnt;
1140 uintptr_t align;
1141
1142 CONTIG_LOCK();
1143
1144 if (mattr) {
1145 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1146 hi = mmu_btop(mattr->dma_attr_addr_hi);
1147 if (hi >= physmax)
1148 hi = physmax - 1;
1149 sgllen = mattr->dma_attr_sgllen;
1150 pfnseg = mmu_btop(mattr->dma_attr_seg);
1151
1152 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1153 if (align > MMU_PAGESIZE)
1154 pfnalign = mmu_btop(align);
1155
1156 /*
1157 * in order to satisfy the request, must minimally
1158 * acquire minctg contiguous pages
1159 */
1160 minctg = howmany(*pgcnt, sgllen);
1161
1162 ASSERT(hi >= lo);
1163
1164 /*
1165 * start from where last searched if the minctg >= lastctgcnt
1166 */
1167 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1168 startpfn = lo;
1169 } else {
1170 hi = physmax - 1;
1171 lo = 0;
1172 sgllen = 1;
1173 pfnseg = mmu.highest_pfn;
1174 minctg = *pgcnt;
1175
1176 if (minctg < lastctgcnt)
1177 startpfn = lo;
1178 }
1179 lastctgcnt = minctg;
1180
1181 ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1182
1183 /* conserve 16m memory - start search above 16m when possible */
1184 if (hi > PFN_16M && startpfn < PFN_16M)
1185 startpfn = PFN_16M;
1186
1187 pfn = startpfn;
1188 if (pfnalign)
1189 pfn = P2ROUNDUP(pfn, pfnalign);
1190
1191 while (pfn + minctg - 1 <= hi) {
1192
1193 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1194 if (plist) {
1195 page_list_concat(&pplist, &plist);
1196 sgllen--;
1197 /*
1198 * return when contig pages no longer needed
1199 */
1200 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1201 startpfn = pfn;
1202 CONTIG_UNLOCK();
1203 check_dma(mattr, pplist, *pgcnt);
1204 return (pplist);
1205 }
1206 minctg = howmany(*pgcnt, sgllen);
1207 }
1208 if (pfnalign)
1209 pfn = P2ROUNDUP(pfn, pfnalign);
1210 }
1211
1212 /* cannot find contig pages in specified range */
1213 if (startpfn == lo) {
1214 CONTIG_UNLOCK();
1215 return (NULL);
1216 }
1217
1218 /* did not start with lo previously */
1219 pfn = lo;
1220 if (pfnalign)
1221 pfn = P2ROUNDUP(pfn, pfnalign);
1222
1223 /* allow search to go above startpfn */
1224 while (pfn < startpfn) {
1225
1226 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1227 if (plist != NULL) {
1228
1229 page_list_concat(&pplist, &plist);
1230 sgllen--;
1231
1232 /*
1233 * return when contig pages no longer needed
1234 */
1235 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1236 startpfn = pfn;
1237 CONTIG_UNLOCK();
1238 check_dma(mattr, pplist, *pgcnt);
1239 return (pplist);
1240 }
1241 minctg = howmany(*pgcnt, sgllen);
1242 }
1243 if (pfnalign)
1244 pfn = P2ROUNDUP(pfn, pfnalign);
1245 }
1246 CONTIG_UNLOCK();
1247 return (NULL);
1248 }
1249 #endif /* !__xpv */
1250
1251 /*
1252 * mnode_range_cnt() calculates the number of memory ranges for mnode and
1253 * memranges[]. Used to determine the size of page lists and mnoderanges.
1254 */
1255 int
1256 mnode_range_cnt(int mnode)
1257 {
1258 #if defined(__xpv)
1259 ASSERT(mnode == 0);
1260 return (1);
1261 #else /* __xpv */
1262 int mri;
1263 int mnrcnt = 0;
1264
1265 if (mem_node_config[mnode].exists != 0) {
1266 mri = nranges - 1;
1267
1268 /* find the memrange containing the base of this mnode */
1269
1270 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1271 mri--;
1272
1273 /*
1274 * increment mnode range counter when memranges or mnode
1275 * boundary is reached.
1276 */
1277 while (mri >= 0 &&
1278 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1279 mnrcnt++;
1280 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1281 mri--;
1282 else
1283 break;
1284 }
1285 }
1286 ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1287 return (mnrcnt);
1288 #endif /* __xpv */
1289 }
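/*
 * Example: a memory node spanning pfns 0x1000 through 0x180000 (16 MB
 * to 6 GB) overlaps the 16M-2G, 2G-4G and 4G+ memranges, so
 * mnode_range_cnt() returns 3 and three mnoderange_t entries will be
 * created for that node.
 */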
1290
1291 /*
1292 * mnode_range_setup() initializes mnoderanges.
1293 */
1294 void
1295 mnode_range_setup(mnoderange_t *mnoderanges)
1296 {
1297 mnoderange_t *mp = mnoderanges;
1298 int mnode, mri;
1299 int mindex = 0; /* current index into mnoderanges array */
1300 int i, j;
1301 pfn_t hipfn;
1302 int last, hi;
1303
1304 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1305 if (mem_node_config[mnode].exists == 0)
1306 continue;
1307
1308 mri = nranges - 1;
1309
1310 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1311 mri--;
1312
1313 while (mri >= 0 && mem_node_config[mnode].physmax >=
1314 MEMRANGELO(mri)) {
1315 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1316 mem_node_config[mnode].physbase);
1317 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1318 mem_node_config[mnode].physmax);
1319 mnoderanges->mnr_mnode = mnode;
1320 mnoderanges->mnr_memrange = mri;
1321 mnoderanges->mnr_exists = 1;
1322 mnoderanges++;
1323 mindex++;
1324 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1325 mri--;
1326 else
1327 break;
1328 }
1329 }
1330
1331 /*
1332 * For now do a simple sort of the mnoderanges array to fill in
1333 * the mnr_next fields. Since mindex is expected to be relatively
1334 * small, we use a simple O(N^2) algorithm.
1335 */
1336 for (i = 0; i < mindex; i++) {
1337 if (mp[i].mnr_pfnlo == 0) /* find lowest */
1338 break;
1339 }
1340 ASSERT(i < mindex);
1341 last = i;
1342 mtype16m = last;
1343 mp[last].mnr_next = -1;
1344 for (i = 0; i < mindex - 1; i++) {
1345 hipfn = (pfn_t)(-1);
1346 hi = -1;
1347 /* find next highest mnode range */
1348 for (j = 0; j < mindex; j++) {
1349 if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1350 mp[j].mnr_pfnlo < hipfn) {
1351 hipfn = mp[j].mnr_pfnlo;
1352 hi = j;
1353 }
1354 }
1355 mp[hi].mnr_next = last;
1356 last = hi;
1357 }
1358 mtypetop = last;
1359 }
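/*
 * After the selection pass above, mtypetop names the mnoderange with
 * the highest pfns and each mnr_next link points at the next lower
 * range, terminating with -1 at the range that contains pfn 0
 * (mtype16m).  pfn_2_mtype() and mtype_func() walk this chain from
 * mtypetop downward.
 */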
1360
1361 #ifndef __xpv
1362 /*
1363 * Update mnoderanges for memory hot-add DR operations.
1364 */
1365 static void
1366 mnode_range_add(int mnode)
1367 {
1368 int *prev;
1369 int n, mri;
1370 pfn_t start, end;
1371 extern void membar_sync(void);
1372
1373 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1374 ASSERT(mem_node_config[mnode].exists);
1375 start = mem_node_config[mnode].physbase;
1376 end = mem_node_config[mnode].physmax;
1377 ASSERT(start <= end);
1378 mutex_enter(&mnoderange_lock);
1379
1380 #ifdef DEBUG
1381 /* Check whether it interleaves with other memory nodes. */
1382 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1383 ASSERT(mnoderanges[n].mnr_exists);
1384 if (mnoderanges[n].mnr_mnode == mnode)
1385 continue;
1386 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1387 end < mnoderanges[n].mnr_pfnlo);
1388 }
1389 #endif /* DEBUG */
1390
1391 mri = nranges - 1;
1392 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1393 mri--;
1394 while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1395 /* Check whether mtype already exists. */
1396 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1397 if (mnoderanges[n].mnr_mnode == mnode &&
1398 mnoderanges[n].mnr_memrange == mri) {
1399 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1400 start);
1401 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1402 end);
1403 break;
1404 }
1405 }
1406
1407 /* Add a new entry if it doesn't exist yet. */
1408 if (n == -1) {
1409 /* Try to find an unused entry in mnoderanges array. */
1410 for (n = 0; n < mnoderangecnt; n++) {
1411 if (mnoderanges[n].mnr_exists == 0)
1412 break;
1413 }
1414 ASSERT(n < mnoderangecnt);
1415 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1416 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1417 mnoderanges[n].mnr_mnode = mnode;
1418 mnoderanges[n].mnr_memrange = mri;
1419 mnoderanges[n].mnr_exists = 1;
1420 /* Page 0 should always be present. */
1421 for (prev = &mtypetop;
1422 mnoderanges[*prev].mnr_pfnlo > start;
1423 prev = &mnoderanges[*prev].mnr_next) {
1424 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1425 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1426 }
1427 mnoderanges[n].mnr_next = *prev;
1428 membar_sync();
1429 *prev = n;
1430 }
1431
1432 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1433 mri--;
1434 else
1435 break;
1436 }
1437
1438 mutex_exit(&mnoderange_lock);
1439 }
1440
1441 /*
1442 * Update mnoderanges for memory hot-removal DR operations.
1443 */
1444 static void
1445 mnode_range_del(int mnode)
1446 {
1447 _NOTE(ARGUNUSED(mnode));
1448 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1449 /* TODO: support deletion operation. */
1450 ASSERT(0);
1451 }
1452
1453 void
1454 plat_slice_add(pfn_t start, pfn_t end)
1455 {
1456 mem_node_add_slice(start, end);
1457 if (plat_dr_enabled()) {
1458 mnode_range_add(PFN_2_MEM_NODE(start));
1459 }
1460 }
1461
1462 void
1463 plat_slice_del(pfn_t start, pfn_t end)
1464 {
1465 ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1466 ASSERT(plat_dr_enabled());
1467 mnode_range_del(PFN_2_MEM_NODE(start));
1468 mem_node_del_slice(start, end);
1469 }
1470 #endif /* __xpv */
1471
1472 /*ARGSUSED*/
1473 int
1474 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1475 {
1476 int mtype = mtypetop;
1477
1478 #if !defined(__xpv)
1479 #if defined(__i386)
1480 /*
1481 * set the mtype range
1482 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1483 * - for non kmem requests, set range to above 4g if memory below 4g
1484 * runs low.
1485 */
1486 if (restricted_kmemalloc && VN_ISKAS(vp) &&
1487 (caddr_t)(vaddr) >= kernelheap &&
1488 (caddr_t)(vaddr) < ekernelheap) {
1489 ASSERT(physmax4g);
1490 mtype = mtype4g;
1491 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1492 btop(pgsz), *flags)) {
1493 *flags |= PGI_MT_RANGE16M;
1494 } else {
1495 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1496 VM_STAT_COND_ADD((*flags & PG_PANIC),
1497 vmm_vmstats.pgpanicalloc);
1498 *flags |= PGI_MT_RANGE0;
1499 }
1500 return (mtype);
1501 }
1502 #endif /* __i386 */
1503
1504 if (RESTRICT4G_ALLOC) {
1505 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1506 /* here only for > 4g systems */
1507 *flags |= PGI_MT_RANGE4G;
1508 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1509 *flags |= PGI_MT_RANGE16M;
1510 } else {
1511 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1512 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1513 *flags |= PGI_MT_RANGE0;
1514 }
1515 #endif /* !__xpv */
1516 return (mtype);
1517 }
1518
1519
1520 /* mtype init for page_get_replacement_page */
1521 /*ARGSUSED*/
1522 int
1523 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1524 {
1525 int mtype = mtypetop;
1526 #if !defined(__xpv)
1527 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1528 *flags |= PGI_MT_RANGE16M;
1529 } else {
1530 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1531 *flags |= PGI_MT_RANGE0;
1532 }
1533 #endif
1534 return (mtype);
1535 }
1536
1537 /*
1538 * Determine if the mnode range specified in mtype contains memory belonging
1539 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
1540 * the range from high pfn to 0, 16m or 4g.
1541 *
1542 * Return the first mnode range type index found, otherwise return -1.
1543 */
1544 int
1545 mtype_func(int mnode, int mtype, uint_t flags)
1546 {
1547 if (flags & PGI_MT_RANGE) {
1548 int mnr_lim = MRI_0;
1549
1550 if (flags & PGI_MT_NEXT) {
1551 mtype = mnoderanges[mtype].mnr_next;
1552 }
1553 if (flags & PGI_MT_RANGE4G)
1554 mnr_lim = MRI_4G; /* exclude 0-4g range */
1555 else if (flags & PGI_MT_RANGE16M)
1556 mnr_lim = MRI_16M; /* exclude 0-16m range */
1557 while (mtype != -1 &&
1558 mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1559 if (mnoderanges[mtype].mnr_mnode == mnode)
1560 return (mtype);
1561 mtype = mnoderanges[mtype].mnr_next;
1562 }
1563 } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1564 return (mtype);
1565 }
1566 return (-1);
1567 }
1568
1569 /*
1570 * Update the page list max counts with the pfn range specified by the
1571 * input parameters.
1572 */
1573 void
1574 mtype_modify_max(pfn_t startpfn, long cnt)
1575 {
1576 int mtype;
1577 pgcnt_t inc;
1578 spgcnt_t scnt = (spgcnt_t)(cnt);
1579 pgcnt_t acnt = ABS(scnt);
1580 pfn_t endpfn = startpfn + acnt;
1581 pfn_t pfn, lo;
1582
1583 if (!physmax4g)
1584 return;
1585
1586 mtype = mtypetop;
1587 for (pfn = endpfn; pfn > startpfn; ) {
1588 ASSERT(mtype != -1);
1589 lo = mnoderanges[mtype].mnr_pfnlo;
1590 if (pfn > lo) {
1591 if (startpfn >= lo) {
1592 inc = pfn - startpfn;
1593 } else {
1594 inc = pfn - lo;
1595 }
1596 if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1597 if (scnt > 0)
1598 maxmem4g += inc;
1599 else
1600 maxmem4g -= inc;
1601 }
1602 pfn -= inc;
1603 }
1604 mtype = mnoderanges[mtype].mnr_next;
1605 }
1606 }
1607
1608 int
1609 mtype_2_mrange(int mtype)
1610 {
1611 return (mnoderanges[mtype].mnr_memrange);
1612 }
1613
1614 void
1615 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1616 {
1617 _NOTE(ARGUNUSED(mnode));
1618 ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1619 *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1620 *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1621 }
1622
1623 size_t
1624 plcnt_sz(size_t ctrs_sz)
1625 {
1626 #ifdef DEBUG
1627 int szc, colors;
1628
1629 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1630 for (szc = 0; szc < mmu_page_sizes; szc++) {
1631 colors = page_get_pagecolors(szc);
1632 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1633 }
1634 #endif
1635 return (ctrs_sz);
1636 }
1637
1638 caddr_t
1639 plcnt_init(caddr_t addr)
1640 {
1641 #ifdef DEBUG
1642 int mt, szc, colors;
1643
1644 for (mt = 0; mt < mnoderangecnt; mt++) {
1645 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1646 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1647 for (szc = 0; szc < mmu_page_sizes; szc++) {
1648 colors = page_get_pagecolors(szc);
1649 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1650 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1651 (pgcnt_t *)addr;
1652 addr += (sizeof (pgcnt_t) * colors);
1653 }
1654 }
1655 #endif
1656 return (addr);
1657 }
1658
1659 void
1660 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1661 {
1662 _NOTE(ARGUNUSED(pp));
1663 #ifdef DEBUG
1664 int bin = PP_2_BIN(pp);
1665
1666 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1667 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1668 cnt);
1669 #endif
1670 ASSERT(mtype == PP_2_MTYPE(pp));
1671 if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1672 atomic_add_long(&freemem4g, cnt);
1673 if (flags & PG_CACHE_LIST)
1674 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1675 else
1676 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1677 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1678 }
1679
1680 /*
1681 * Returns the free page count for mnode
1682 */
1683 int
1684 mnode_pgcnt(int mnode)
1685 {
1686 int mtype = mtypetop;
1687 int flags = PGI_MT_RANGE0;
1688 pgcnt_t pgcnt = 0;
1689
1690 mtype = mtype_func(mnode, mtype, flags);
1691
1692 while (mtype != -1) {
1693 pgcnt += MTYPE_FREEMEM(mtype);
1694 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1695 }
1696 return (pgcnt);
1697 }
1698
1699 /*
1700 * Initialize page coloring variables based on the l2 cache parameters.
1701 * Calculate and return memory needed for page coloring data structures.
1702 */
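/*
 * Worked example of the color computation below: a 256 KB, 4-way set
 * associative l2 cache with 4 KB pages gives
 *	l2_colors   = 256K / (4 * 4K) = 16
 *	page_colors = MAX(16, PAGE_COLORS_MIN) = 16, mask = 0xf
 * while a fully associative cache (l2_assoc == 0) collapses to a
 * single l2 color.
 */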
1703 size_t
1704 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1705 {
1706 _NOTE(ARGUNUSED(l2_linesz));
1707 size_t colorsz = 0;
1708 int i;
1709 int colors;
1710
1711 #if defined(__xpv)
1712 /*
1713 * Hypervisor domains currently don't have any concept of NUMA.
1714 * Hence we'll act like there is only 1 memrange.
1715 */
1716 i = memrange_num(1);
1717 #else /* !__xpv */
1718 /*
1719 * Reduce the memory range lists if we don't have large amounts
1720 * of memory. This avoids searching known empty free lists.
1721 * To support memory DR operations, we need to keep memory ranges
1722 * for possible memory hot-add operations.
1723 */
1724 if (plat_dr_physmax > physmax)
1725 i = memrange_num(plat_dr_physmax);
1726 else
1727 i = memrange_num(physmax);
1728 #if defined(__i386)
1729 if (i > MRI_4G)
1730 restricted_kmemalloc = 0;
1731 #endif
1732 /* physmax greater than 4g */
1733 if (i == MRI_4G)
1734 physmax4g = 1;
1735 #endif /* !__xpv */
1736 memranges += i;
1737 nranges -= i;
1738
1739 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1740
1741 ASSERT(ISP2(l2_linesz));
1742 ASSERT(l2_sz > MMU_PAGESIZE);
1743
1744 /* l2_assoc is 0 for fully associative l2 cache */
1745 if (l2_assoc)
1746 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1747 else
1748 l2_colors = 1;
1749
1750 ASSERT(ISP2(l2_colors));
1751
1752 /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1753 page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1754
1755 /*
1756 * cpu_page_colors is non-zero when a page color may be spread across
1757 * multiple bins.
1758 */
1759 if (l2_colors < page_colors)
1760 cpu_page_colors = l2_colors;
1761
1762 ASSERT(ISP2(page_colors));
1763
1764 page_colors_mask = page_colors - 1;
1765
1766 ASSERT(ISP2(CPUSETSIZE()));
1767 page_coloring_shift = lowbit(CPUSETSIZE());
1768
1769 /* initialize number of colors per page size */
1770 for (i = 0; i <= mmu.max_page_level; i++) {
1771 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1772 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1773 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1774 hw_page_array[i].hp_colors = (page_colors_mask >>
1775 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1776 + 1;
1777 colorequivszc[i] = 0;
1778 }
1779
1780 /*
1781 * The value of cpu_page_colors determines if additional color bins
1782 * need to be checked for a particular color in the page_get routines.
1783 */
1784 if (cpu_page_colors != 0) {
1785
1786 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1787 ASSERT(a > 0);
1788 ASSERT(a < 16);
1789
1790 for (i = 0; i <= mmu.max_page_level; i++) {
1791 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1792 colorequivszc[i] = 0;
1793 continue;
1794 }
1795 while ((colors >> a) == 0)
1796 a--;
1797 ASSERT(a >= 0);
1798
1799 /* higher 4 bits encode color equiv mask */
1800 colorequivszc[i] = (a << 4);
1801 }
1802 }
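	/*
	 * Example (illustrative numbers): with page_colors == 64 and
	 * cpu_page_colors == 16, a == lowbit(64) - lowbit(16) == 2 and
	 * colorequivszc[0] is set to 0x20, i.e. the color equiv shift
	 * of 2 is stored in the high nibble for base pages.
	 */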
1803
1804 /* factor in colorequiv to check additional 'equivalent' bins. */
1805 if (colorequiv > 1) {
1806
1807 int a = lowbit(colorequiv) - 1;
1808 if (a > 15)
1809 a = 15;
1810
1811 for (i = 0; i <= mmu.max_page_level; i++) {
1812 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1813 continue;
1814 }
1815 while ((colors >> a) == 0)
1816 a--;
1817 if ((a << 4) > colorequivszc[i]) {
1818 colorequivszc[i] = (a << 4);
1819 }
1820 }
1821 }
1822
1823 /* size for mnoderanges */
1824 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1825 mnoderangecnt += mnode_range_cnt(i);
1826 if (plat_dr_support_memory()) {
1827 /*
1828 * Reserve enough space for memory DR operations.
1829 * Two extra mnoderanges for possible fragmentation:
1830 * one for the 2G boundary and the other for the 4G boundary.
1831 * We don't expect a memory board crossing the 16M boundary
1832 * for memory hot-add operations on x86 platforms.
1833 */
1834 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1835 }
1836 colorsz = mnoderangecnt * sizeof (mnoderange_t);
1837
1838 /* size for fpc_mutex and cpc_mutex */
1839 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1840
1841 /* size of page_freelists */
1842 colorsz += mnoderangecnt * sizeof (page_t ***);
1843 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1844
1845 for (i = 0; i < mmu_page_sizes; i++) {
1846 colors = page_get_pagecolors(i);
1847 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1848 }
1849
1850 /* size of page_cachelists */
1851 colorsz += mnoderangecnt * sizeof (page_t **);
1852 colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1853
1854 return (colorsz);
1855 }
1856
1857 /*
1858 * Called once at startup to configure page_coloring data structures and
1859 * does the 1st page_free()/page_freelist_add().
1860 */
1861 void
1862 page_coloring_setup(caddr_t pcmemaddr)
1863 {
1864 int i;
1865 int j;
1866 int k;
1867 caddr_t addr;
1868 int colors;
1869
1870 /*
1871 * do page coloring setup
1872 */
1873 addr = pcmemaddr;
1874
1875 mnoderanges = (mnoderange_t *)addr;
1876 addr += (mnoderangecnt * sizeof (mnoderange_t));
1877
1878 mnode_range_setup(mnoderanges);
1879
1880 if (physmax4g)
1881 mtype4g = pfn_2_mtype(0xfffff);
1882
1883 for (k = 0; k < NPC_MUTEX; k++) {
1884 fpc_mutex[k] = (kmutex_t *)addr;
1885 addr += (max_mem_nodes * sizeof (kmutex_t));
1886 }
1887 for (k = 0; k < NPC_MUTEX; k++) {
1888 cpc_mutex[k] = (kmutex_t *)addr;
1889 addr += (max_mem_nodes * sizeof (kmutex_t));
1890 }
1891 page_freelists = (page_t ****)addr;
1892 addr += (mnoderangecnt * sizeof (page_t ***));
1893
1894 page_cachelists = (page_t ***)addr;
1895 addr += (mnoderangecnt * sizeof (page_t **));
1896
1897 for (i = 0; i < mnoderangecnt; i++) {
1898 page_freelists[i] = (page_t ***)addr;
1899 addr += (mmu_page_sizes * sizeof (page_t **));
1900
1901 for (j = 0; j < mmu_page_sizes; j++) {
1902 colors = page_get_pagecolors(j);
1903 page_freelists[i][j] = (page_t **)addr;
1904 addr += (colors * sizeof (page_t *));
1905 }
1906 page_cachelists[i] = (page_t **)addr;
1907 addr += (page_colors * sizeof (page_t *));
1908 }
1909 }
1910
1911 #if defined(__xpv)
1912 /*
1913 * Give back 10% of the io_pool pages to the free list.
1914 * Don't shrink the pool below some absolute minimum.
1915 */
1916 static void
1917 page_io_pool_shrink()
1918 {
1919 int retcnt;
1920 page_t *pp, *pp_first, *pp_last, **curpool;
1921 mfn_t mfn;
1922 int bothpools = 0;
1923
1924 mutex_enter(&io_pool_lock);
1925 io_pool_shrink_attempts++; /* should be a kstat? */
1926 retcnt = io_pool_cnt / 10;
1927 if (io_pool_cnt - retcnt < io_pool_cnt_min)
1928 retcnt = io_pool_cnt - io_pool_cnt_min;
1929 if (retcnt <= 0)
1930 goto done;
1931 io_pool_shrinks++; /* should be a kstat? */
1932 curpool = &io_pool_4g;
1933 domore:
1934 /*
1935 * Loop through taking pages from the end of the list
1936 * (highest mfns) till the amount to return is reached.
1937 */
1938 for (pp = *curpool; pp && retcnt > 0; ) {
1939 pp_first = pp_last = pp->p_prev;
1940 if (pp_first == *curpool)
1941 break;
1942 retcnt--;
1943 io_pool_cnt--;
1944 page_io_pool_sub(curpool, pp_first, pp_last);
1945 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
1946 start_mfn = mfn;
1947 page_free(pp_first, 1);
1948 pp = *curpool;
1949 }
1950 if (retcnt != 0 && !bothpools) {
1951 /*
1952 * If not enough were found in the less constrained pool, try
1953 * the more constrained one.
1954 */
1955 curpool = &io_pool_16m;
1956 bothpools = 1;
1957 goto domore;
1958 }
1959 done:
1960 mutex_exit(&io_pool_lock);
1961 }
1962
1963 #endif /* __xpv */
1964
1965 uint_t
1966 page_create_update_flags_x86(uint_t flags)
1967 {
1968 #if defined(__xpv)
1969 /*
1970 * Check whether this is an urgent allocation and free pages are depleted.
1971 */
1972 if (!(flags & PG_WAIT) && freemem < desfree)
1973 page_io_pool_shrink();
1974 #else /* !__xpv */
1975 /*
1976 * page_create_get_something may call this because 4g memory may be
1977 * depleted. Set flags to allow for relocation of base page below
1978 * 4g if necessary.
1979 */
1980 if (physmax4g)
1981 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1982 #endif /* __xpv */
1983 return (flags);
1984 }
1985
1986 /*ARGSUSED*/
1987 int
1988 bp_color(struct buf *bp)
1989 {
1990 return (0);
1991 }
1992
1993 #if defined(__xpv)
1994
1995 /*
1996 * Take pages out of an io_pool
1997 */
1998 static void
1999 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
2000 {
2001 if (*poolp == pp_first) {
2002 *poolp = pp_last->p_next;
2003 if (*poolp == pp_first)
2004 *poolp = NULL;
2005 }
2006 pp_first->p_prev->p_next = pp_last->p_next;
2007 pp_last->p_next->p_prev = pp_first->p_prev;
2008 pp_first->p_prev = pp_last;
2009 pp_last->p_next = pp_first;
2010 }
2011
2012 /*
2013 * Put a page on the io_pool list. The list is ordered by increasing MFN.
2014 */
2015 static void
2016 page_io_pool_add(page_t **poolp, page_t *pp)
2017 {
2018 page_t *look;
2019 mfn_t mfn = mfn_list[pp->p_pagenum];
2020
2021 if (*poolp == NULL) {
2022 *poolp = pp;
2023 pp->p_next = pp;
2024 pp->p_prev = pp;
2025 return;
2026 }
2027
2028 /*
2029 * Since we try to take pages from the high end of the pool,
2030 * chances are good that the pages to be put on the list will
2031 * go at or near the end of the list.  So start at the end and
2032 * work backwards.
2033 */
2034 look = (*poolp)->p_prev;
2035 while (mfn < mfn_list[look->p_pagenum]) {
2036 look = look->p_prev;
2037 if (look == (*poolp)->p_prev)
2038 break; /* backed all the way to front of list */
2039 }
2040
2041 /* insert after look */
2042 pp->p_prev = look;
2043 pp->p_next = look->p_next;
2044 pp->p_next->p_prev = pp;
2045 look->p_next = pp;
2046 if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2047 /*
2048 * We inserted a new first list element; adjust the pool
2049 * pointer to the newly inserted element.
2050 */
2051 *poolp = pp;
2052 }
2053 }
2054
2055 /*
2056 * Add a page to the io_pool. Setting the force flag will force the page
2057 * into the io_pool no matter what.
2058 */
2059 static void
2060 add_page_to_pool(page_t *pp, int force)
2061 {
2062 page_t *highest;
2063 page_t *freep = NULL;
2064
2065 mutex_enter(&io_pool_lock);
2066 /*
2067 * Always keep the scarce low memory pages
2068 */
2069 if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2070 ++io_pool_cnt;
2071 page_io_pool_add(&io_pool_16m, pp);
2072 goto done;
2073 }
2074 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2075 ++io_pool_cnt;
2076 page_io_pool_add(&io_pool_4g, pp);
2077 } else {
2078 highest = io_pool_4g->p_prev;
2079 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2080 page_io_pool_sub(&io_pool_4g, highest, highest);
2081 page_io_pool_add(&io_pool_4g, pp);
2082 freep = highest;
2083 } else {
2084 freep = pp;
2085 }
2086 }
2087 done:
2088 mutex_exit(&io_pool_lock);
2089 if (freep)
2090 page_free(freep, 1);
2091 }
2092
2093
2094 int contig_pfn_cnt; /* no of pfns in the contig pfn list */
2095 int contig_pfn_max; /* capacity of the contig pfn list */
2096 int next_alloc_pfn; /* next position in list to start a contig search */
2097 int contig_pfnlist_updates; /* pfn list update count */
2098 int contig_pfnlist_builds; /* how many times have we (re)built list */
2099 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2100 int create_contig_pending; /* nonzero means taskq creating contig list */
2101 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */
2102
2103 /*
2104 * Comparator used to sort a list of pfns by their underlying mfns.
2105 */
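/*
 * For example, the qsort() call in create_contig_pfnlist() uses this
 * comparator so that mfn_list[contig_pfn_list[i]] ends up non-decreasing
 * in i.
 */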
2106 static int
2107 mfn_compare(const void *pfnp1, const void *pfnp2)
2108 {
2109 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2110 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2111
2112 if (mfn1 > mfn2)
2113 return (1);
2114 if (mfn1 < mfn2)
2115 return (-1);
2116 return (0);
2117 }
2118
2119 /*
2120 * Compact the contig_pfn_list by tossing all the non-contiguous
2121 * elements from the list.
2122 */
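/*
 * Illustrative example: if the sorted list covers pages whose mfns are
 * 5, 6, 9, 12, 13 and 20, only the runs {5,6} and {12,13} survive the
 * compaction; 9 and 20 are dropped since neither neighbor is mfn-adjacent.
 */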
2123 static void
2124 compact_contig_pfn_list(void)
2125 {
2126 pfn_t pfn, lapfn, prev_lapfn;
2127 mfn_t mfn;
2128 int i, newcnt = 0;
2129
2130 prev_lapfn = 0;
2131 for (i = 0; i < contig_pfn_cnt - 1; i++) {
2132 pfn = contig_pfn_list[i];
2133 lapfn = contig_pfn_list[i + 1];
2134 mfn = mfn_list[pfn];
2135 /*
2136 * See if the next pfn's mfn is contiguous with this one.
2137 */
2138 if (mfn_list[lapfn] != mfn + 1)
2139 continue;
2140 /*
2141 * Both pfn and the lookahead are put in the list,
2142 * unless pfn was already added as the previous lookahead.
2143 */
2144 if (pfn != prev_lapfn)
2145 contig_pfn_list[newcnt++] = pfn;
2146 contig_pfn_list[newcnt++] = lapfn;
2147 prev_lapfn = lapfn;
2148 }
2149 for (i = newcnt; i < contig_pfn_cnt; i++)
2150 contig_pfn_list[i] = 0;
2151 contig_pfn_cnt = newcnt;
2152 }
2153
2154 /*ARGSUSED*/
2155 static void
2156 call_create_contiglist(void *arg)
2157 {
2158 (void) create_contig_pfnlist(PG_WAIT);
2159 }
2160
2161 /*
2162 * Create a list of freelist pfns that have underlying
2163 * contiguous mfns.  The list is kept in ascending mfn order.
2164 * Returns 1 if the list was created, 0 otherwise.
2165 */
2166 static int
2167 create_contig_pfnlist(uint_t flags)
2168 {
2169 pfn_t pfn;
2170 page_t *pp;
2171 int ret = 1;
2172
2173 mutex_enter(&contig_list_lock);
2174 if (contig_pfn_list != NULL)
2175 goto out;
2176 contig_pfn_max = freemem + (freemem / 10);
2177 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2178 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2179 if (contig_pfn_list == NULL) {
2180 /*
2181 * We could not create the contig list (because we could
2182 * not sleep for memory), so dispatch a taskq job that can
2183 * sleep to get the memory.
2184 */
2185 if (!create_contig_pending) {
2186 if (taskq_dispatch(system_taskq, call_create_contiglist,
2187 NULL, TQ_NOSLEEP) != NULL)
2188 create_contig_pending = 1;
2189 }
2190 contig_pfnlist_buildfailed++; /* count list build failures */
2191 ret = 0;
2192 goto out;
2193 }
2194 create_contig_pending = 0;
2195 ASSERT(contig_pfn_cnt == 0);
2196 for (pfn = 0; pfn < mfn_count; pfn++) {
2197 pp = page_numtopp_nolock(pfn);
2198 if (pp == NULL || !PP_ISFREE(pp))
2199 continue;
2200 contig_pfn_list[contig_pfn_cnt] = pfn;
2201 if (++contig_pfn_cnt == contig_pfn_max)
2202 break;
2203 }
2204 /*
2205 * Sanity check the new list.
2206 */
2207 if (contig_pfn_cnt < 2) { /* no contig pfns */
2208 contig_pfn_cnt = 0;
2209 contig_pfnlist_buildfailed++;
2210 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2211 contig_pfn_list = NULL;
2212 contig_pfn_max = 0;
2213 ret = 0;
2214 goto out;
2215 }
2216 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2217 compact_contig_pfn_list();
2218 /*
2219 * Make sure next search of the newly created contiguous pfn
2220 * list starts at the beginning of the list.
2221 */
2222 next_alloc_pfn = 0;
2223 contig_pfnlist_builds++; /* count list builds */
2224 out:
2225 mutex_exit(&contig_list_lock);
2226 return (ret);
2227 }
2228
2229
2230 /*
2231 * Toss the current contig pfnlist.  The caller is about to do a massive
2232 * update to pfn<->mfn mappings, so destroy the list and hold the lock
2233 * until the update is complete.
2234 */
2235 void
2236 clear_and_lock_contig_pfnlist()
2237 {
2238 pfn_t *listp = NULL;
2239 size_t listsize;
2240
2241 mutex_enter(&contig_list_lock);
2242 if (contig_pfn_list != NULL) {
2243 listp = contig_pfn_list;
2244 listsize = contig_pfn_max * sizeof (pfn_t);
2245 contig_pfn_list = NULL;
2246 contig_pfn_max = contig_pfn_cnt = 0;
2247 }
2248 if (listp != NULL)
2249 kmem_free(listp, listsize);
2250 }
2251
2252 /*
2253 * Unlock the contig_pfn_list. The next attempted use of it will cause
2254 * it to be re-created.
2255 */
2256 void
2257 unlock_contig_pfnlist()
2258 {
2259 mutex_exit(&contig_list_lock);
2260 }
2261
2262 /*
2263 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2264 */
2265 void
2266 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2267 {
2268 int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2269 pfn_t probe_pfn;
2270 mfn_t probe_mfn;
2271 int drop_lock = 0;
2272
2273 if (mutex_owner(&contig_list_lock) != curthread) {
2274 drop_lock = 1;
2275 mutex_enter(&contig_list_lock);
2276 }
2277 if (contig_pfn_list == NULL)
2278 goto done;
2279 contig_pfnlist_updates++;
2280 /*
2281 * Find the pfn in the current list. Use a binary chop to locate it.
2282 */
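	/*
	 * The list is ordered by mfn, so the chop is keyed on oldmfn (the
	 * mfn this pfn mapped to before the reassignment).  Converging on
	 * probe_lo without a match means the pfn is not in the list.
	 */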
2283 probe_hi = contig_pfn_cnt - 1;
2284 probe_lo = 0;
2285 probe_pos = (probe_hi + probe_lo) / 2;
2286 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2287 if (probe_pos == probe_lo) { /* pfn not in list */
2288 probe_pos = -1;
2289 break;
2290 }
2291 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2292 probe_lo = probe_pos;
2293 else
2294 probe_hi = probe_pos;
2295 probe_pos = (probe_hi + probe_lo) / 2;
2296 }
2297 if (probe_pos >= 0) {
2298 /*
2299 * Remove pfn from list and ensure next alloc
2300 * position stays in bounds.
2301 */
2302 if (--contig_pfn_cnt <= next_alloc_pfn)
2303 next_alloc_pfn = 0;
2304 if (contig_pfn_cnt < 2) { /* no contig pfns */
2305 contig_pfn_cnt = 0;
2306 kmem_free(contig_pfn_list,
2307 contig_pfn_max * sizeof (pfn_t));
2308 contig_pfn_list = NULL;
2309 contig_pfn_max = 0;
2310 goto done;
2311 }
2312 ovbcopy(&contig_pfn_list[probe_pos + 1],
2313 &contig_pfn_list[probe_pos],
2314 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2315 }
2316 if (newmfn == MFN_INVALID)
2317 goto done;
2318 /*
2319 * Check if new mfn has adjacent mfns in the list
2320 */
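	/*
	 * insert_after stays at -2 if no mfn-adjacent neighbor is found;
	 * otherwise it is set to the index the new pfn should follow
	 * (probe_pos if newmfn == probe_mfn + 1, probe_pos - 1 if
	 * newmfn == probe_mfn - 1).
	 */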
2321 probe_hi = contig_pfn_cnt - 1;
2322 probe_lo = 0;
2323 insert_after = -2;
2324 do {
2325 probe_pos = (probe_hi + probe_lo) / 2;
2326 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2327 if (newmfn == probe_mfn + 1)
2328 insert_after = probe_pos;
2329 else if (newmfn == probe_mfn - 1)
2330 insert_after = probe_pos - 1;
2331 if (probe_pos == probe_lo)
2332 break;
2333 if (probe_mfn <= newmfn)
2334 probe_lo = probe_pos;
2335 else
2336 probe_hi = probe_pos;
2337 } while (insert_after == -2);
2338 /*
2339 * If there is space in the list and there are adjacent mfns,
2340 * insert the pfn into its proper place in the list.
2341 */
2342 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2343 insert_point = insert_after + 1;
2344 ovbcopy(&contig_pfn_list[insert_point],
2345 &contig_pfn_list[insert_point + 1],
2346 (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2347 contig_pfn_list[insert_point] = pfn;
2348 contig_pfn_cnt++;
2349 }
2350 done:
2351 if (drop_lock)
2352 mutex_exit(&contig_list_lock);
2353 }
2354
2355 /*
2356 * Called to (re-)populate the io_pool from the free page lists.
2357 */
2358 long
2359 populate_io_pool(void)
2360 {
2361 pfn_t pfn;
2362 mfn_t mfn, max_mfn;
2363 page_t *pp;
2364
2365 /*
2366 * Figure out the bounds of the pool on the first invocation.
2367 * We use a percentage of memory for the io pool size; the pool
2368 * is allowed to shrink, but never below a fixed minimum.
2369 */
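	/*
	 * As an example of the arithmetic below: if io_pool_physmem_pct
	 * were 2, the initial maximum would be physmem / 50 pages, i.e.
	 * roughly 2% of physical memory.
	 */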
2370 if (io_pool_cnt_max == 0) {
2371 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2372 io_pool_cnt_lowater = io_pool_cnt_max;
2373 /*
2374 * This is the first time in populate_io_pool; grab a va to use
2375 * when we need to allocate pages.
2376 */
2377 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2378 }
2379 /*
2380 * If we are out of pages in the pool, then grow the size of the pool
2381 */
2382 if (io_pool_cnt == 0) {
2383 /*
2384 * Grow the max size of the io pool by 5%, but never more than
2385 * 25% of physical memory.
2386 */
2387 if (io_pool_cnt_max < physmem / 4)
2388 io_pool_cnt_max += io_pool_cnt_max / 20;
2389 }
2390 io_pool_grows++; /* should be a kstat? */
2391
2392 /*
2393 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2394 */
2395 (void) mfn_to_pfn(start_mfn);
2396 max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2397 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2398 pfn = mfn_to_pfn(mfn);
2399 if (pfn & PFN_IS_FOREIGN_MFN)
2400 continue;
2401 /*
2402 * try to allocate it from free pages
2403 */
2404 pp = page_numtopp_alloc(pfn);
2405 if (pp == NULL)
2406 continue;
2407 PP_CLRFREE(pp);
2408 add_page_to_pool(pp, 1);
2409 if (io_pool_cnt >= io_pool_cnt_max)
2410 break;
2411 }
2412
2413 return (io_pool_cnt);
2414 }
2415
2416 /*
2417 * Destroy a page that was being used for DMA I/O. It may or
2418 * may not actually go back to the io_pool.
2419 */
2420 void
2421 page_destroy_io(page_t *pp)
2422 {
2423 mfn_t mfn = mfn_list[pp->p_pagenum];
2424
2425 /*
2426 * A reservation was made when the page was allocated; release it now.
2427 */
2428 page_unresv(1);
2429 /*
2430 * Unload translations, if any, then hash out the
2431 * page to erase its identity.
2432 */
2433 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2434 page_hashout(pp, NULL);
2435
2436 /*
2437 * If the page came from the free lists, just put it back to them.
2438 * DomU pages always go on the free lists as well.
2439 */
2440 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2441 page_free(pp, 1);
2442 return;
2443 }
2444
2445 add_page_to_pool(pp, 0);
2446 }
2447
2448
2449 long contig_searches; /* count of times contig pages requested */
2450 long contig_search_restarts; /* count of contig ranges tried */
2451 long contig_search_failed; /* count of contig alloc failures */
2452
2453 /*
2454 * Free partial page list
2455 */
2456 static void
2457 free_partial_list(page_t **pplist)
2458 {
2459 page_t *pp;
2460
2461 while (*pplist != NULL) {
2462 pp = *pplist;
2463 page_io_pool_sub(pplist, pp, pp);
2464 page_free(pp, 1);
2465 }
2466 }
2467
2468 /*
2469 * Look through the contiguous pfns that are not part of the io_pool for
2470 * a run of contiguous free pages.  Return a list of the pages found, or NULL.
2471 */
2472 page_t *
2473 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2474 pgcnt_t pfnalign)
2475 {
2476 page_t *pp, *plist = NULL;
2477 mfn_t mfn, prev_mfn, start_mfn;
2478 pfn_t pfn;
2479 int pages_needed, pages_requested;
2480 int search_start;
2481
2482 /*
2483 * create the contig pfn list if not already done
2484 */
2485 retry:
2486 mutex_enter(&contig_list_lock);
2487 if (contig_pfn_list == NULL) {
2488 mutex_exit(&contig_list_lock);
2489 if (!create_contig_pfnlist(flags)) {
2490 return (NULL);
2491 }
2492 goto retry;
2493 }
2494 contig_searches++;
2495 /*
2496 * Search contiguous pfn list for physically contiguous pages not in
2497 * the io_pool. Start the search where the last search left off.
2498 */
2499 pages_requested = pages_needed = npages;
2500 search_start = next_alloc_pfn;
2501 start_mfn = prev_mfn = 0;
2502 while (pages_needed) {
2503 pfn = contig_pfn_list[next_alloc_pfn];
2504 mfn = pfn_to_mfn(pfn);
2505 /*
2506 * Check that this mfn is either the first one or contiguous with
2507 * the previous one, that the page corresponding to it is free, and
2508 * that the mfn range does not cross a segment boundary.
2509 */
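		/*
		 * pfnseg is the DMA segment boundary expressed in page
		 * frames.  If the run crossed a segment boundary, the
		 * masked mfn would wrap and become smaller than the masked
		 * start_mfn, which is what the last test below catches.
		 */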
2510 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2511 (pp = page_numtopp_alloc(pfn)) != NULL &&
2512 !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2513 PP_CLRFREE(pp);
2514 page_io_pool_add(&plist, pp);
2515 pages_needed--;
2516 if (prev_mfn == 0) {
2517 if (pfnalign &&
2518 mfn != P2ROUNDUP(mfn, pfnalign)) {
2519 /*
2520 * not properly aligned
2521 */
2522 contig_search_restarts++;
2523 free_partial_list(&plist);
2524 pages_needed = pages_requested;
2525 start_mfn = prev_mfn = 0;
2526 goto skip;
2527 }
2528 start_mfn = mfn;
2529 }
2530 prev_mfn = mfn;
2531 } else {
2532 contig_search_restarts++;
2533 free_partial_list(&plist);
2534 pages_needed = pages_requested;
2535 start_mfn = prev_mfn = 0;
2536 }
2537 skip:
2538 if (++next_alloc_pfn == contig_pfn_cnt)
2539 next_alloc_pfn = 0;
2540 if (next_alloc_pfn == search_start)
2541 break; /* all pfns searched */
2542 }
2543 mutex_exit(&contig_list_lock);
2544 if (pages_needed) {
2545 contig_search_failed++;
2546 /*
2547 * Failed to find enough contig pages;
2548 * free the partial page list.
2549 */
2550 free_partial_list(&plist);
2551 }
2552 return (plist);
2553 }
2554
2555 /*
2556 * Search the reserved io pool pages for a page range with the
2557 * desired characteristics.
2558 */
2559 page_t *
2560 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2561 {
2562 page_t *pp_first, *pp_last;
2563 page_t *pp, **poolp;
2564 pgcnt_t nwanted, pfnalign;
2565 uint64_t pfnseg;
2566 mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2567 int align, attempt = 0;
2568
2569 if (minctg == 1)
2570 contig = 0;
2571 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2572 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2573 pfnseg = mmu_btop(mattr->dma_attr_seg);
2574 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2575 if (align > MMU_PAGESIZE)
2576 pfnalign = mmu_btop(align);
2577 else
2578 pfnalign = 0;
2579
2580 try_again:
2581 /*
2582 * See if we want pages for a legacy device
2583 */
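	/*
	 * For example, a device limited to 24-bit DMA (dma_attr_addr_hi of
	 * 0xffffff) yields hi_mfn < PFN_16MEG and is served from io_pool_16m;
	 * everything else comes from io_pool_4g.
	 */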
2584 if (hi_mfn < PFN_16MEG)
2585 poolp = &io_pool_16m;
2586 else
2587 poolp = &io_pool_4g;
2588 try_smaller:
2589 /*
2590 * Take pages from I/O pool. We'll use pages from the highest
2591 * MFN range possible.
2592 */
2593 pp_first = pp_last = NULL;
2594 mutex_enter(&io_pool_lock);
2595 nwanted = minctg;
2596 for (pp = *poolp; pp && nwanted > 0; ) {
2597 pp = pp->p_prev;
2598
2599 /*
2600 * skip pages above allowable range
2601 */
2602 mfn = mfn_list[pp->p_pagenum];
2603 if (hi_mfn < mfn)
2604 goto skip;
2605
2606 /*
2607 * stop at pages below allowable range
2608 */
2609 if (lo_mfn > mfn)
2610 break;
2611 restart:
2612 if (pp_last == NULL) {
2613 /*
2614 * Check alignment
2615 */
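			/*
			 * Pages are collected from high mfn downward, so a
			 * full physically contiguous run built from here
			 * would have tmfn as its lowest mfn; check alignment
			 * and the segment constraint against tmfn up front.
			 */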
2616 tmfn = mfn - (minctg - 1);
2617 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2618 goto skip; /* not properly aligned */
2619 /*
2620 * Check segment
2621 */
2622 if ((mfn & pfnseg) < (tmfn & pfnseg))
2623 goto skip; /* crosses seg boundary */
2624 /*
2625 * Start building page list
2626 */
2627 pp_first = pp_last = pp;
2628 nwanted--;
2629 } else {
2630 /*
2631 * check physical contiguity if required
2632 */
2633 if (contig &&
2634 mfn_list[pp_first->p_pagenum] != mfn + 1) {
2635 /*
2636 * not a contiguous page, restart list.
2637 */
2638 pp_last = NULL;
2639 nwanted = minctg;
2640 goto restart;
2641 } else { /* add page to list */
2642 pp_first = pp;
2643 nwanted--;
2644 }
2645 }
2646 skip:
2647 if (pp == *poolp)
2648 break;
2649 }
2650
2651 /*
2652 * If we didn't find enough memory, try the more constrained pool,
2653 * then sweep free pages into the DMA pool and try again.
2654 */
2655 if (nwanted != 0) {
2656 mutex_exit(&io_pool_lock);
2657 /*
2658 * If we were looking in the less constrained pool and
2659 * didn't find pages, try the more constrained pool.
2660 */
2661 if (poolp == &io_pool_4g) {
2662 poolp = &io_pool_16m;
2663 goto try_smaller;
2664 }
2665 kmem_reap();
2666 if (++attempt < 4) {
2667 /*
2668 * Grab some more io_pool pages
2669 */
2670 (void) populate_io_pool();
2671 goto try_again; /* go around and retry */
2672 }
2673 return (NULL);
2674 }
2675 /*
2676 * Found the pages, now snip them from the list
2677 */
2678 page_io_pool_sub(poolp, pp_first, pp_last);
2679 io_pool_cnt -= minctg;
2680 /*
2681 * reset low water mark
2682 */
2683 if (io_pool_cnt < io_pool_cnt_lowater)
2684 io_pool_cnt_lowater = io_pool_cnt;
2685 mutex_exit(&io_pool_lock);
2686 return (pp_first);
2687 }
2688
2689 page_t *
2690 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2691 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2692 {
2693 uint_t kflags;
2694 int order, extra, extpages, i, contig, nbits, extents;
2695 page_t *pp, *expp, *pp_first, **pplist = NULL;
2696 mfn_t *mfnlist = NULL;
2697
2698 contig = flags & PG_PHYSCONTIG;
2699 if (minctg == 1)
2700 contig = 0;
2701 flags &= ~PG_PHYSCONTIG;
2702 kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2703 /*
2704 * The hypervisor allocates extents; if we want contig
2705 * pages, the extent size must be >= minctg.
2706 */
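	/*
	 * Example: a contiguous request with minctg == 5 gives
	 * highbit(5) - 1 == 2, the remainder test bumps order to 3, so
	 * extpages == 8 and the 3 extra pages are handed back to the
	 * free list after the exchange.
	 */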
2707 if (contig) {
2708 order = highbit(minctg) - 1;
2709 if (minctg & ((1 << order) - 1))
2710 order++;
2711 extpages = 1 << order;
2712 } else {
2713 order = 0;
2714 extpages = minctg;
2715 }
2716 if (extpages > minctg) {
2717 extra = extpages - minctg;
2718 if (!page_resv(extra, kflags))
2719 return (NULL);
2720 }
2721 pp_first = NULL;
2722 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2723 if (pplist == NULL)
2724 goto balloon_fail;
2725 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2726 if (mfnlist == NULL)
2727 goto balloon_fail;
2728 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2729 if (pp == NULL)
2730 goto balloon_fail;
2731 pp_first = pp;
2732 if (extpages > minctg) {
2733 /*
2734 * Fill out the rest of the extent pages to swap
2735 * with the hypervisor.
2736 */
2737 for (i = 0; i < extra; i++) {
2738 expp = page_create_va(vp,
2739 (u_offset_t)(uintptr_t)io_pool_kva,
2740 PAGESIZE, flags, &kvseg, io_pool_kva);
2741 if (expp == NULL)
2742 goto balloon_fail;
2743 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2744 page_io_unlock(expp);
2745 page_hashout(expp, NULL);
2746 page_io_lock(expp);
2747 /*
2748 * add page to end of list
2749 */
2750 expp->p_prev = pp_first->p_prev;
2751 expp->p_next = pp_first;
2752 expp->p_prev->p_next = expp;
2753 pp_first->p_prev = expp;
2754 }
2755
2756 }
2757 for (i = 0; i < extpages; i++) {
2758 pplist[i] = pp;
2759 pp = pp->p_next;
2760 }
2761 nbits = highbit(mattr->dma_attr_addr_hi);
2762 extents = contig ? 1 : minctg;
2763 if (balloon_replace_pages(extents, pplist, nbits, order,
2764 mfnlist) != extents) {
2765 if (ioalloc_dbg)
2766 cmn_err(CE_NOTE, "request to hypervisor"
2767 " for %d pages, maxaddr %" PRIx64 " failed",
2768 extpages, mattr->dma_attr_addr_hi);
2769 goto balloon_fail;
2770 }
2771
2772 kmem_free(pplist, extpages * sizeof (page_t *));
2773 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2774 /*
2775 * Return any excess pages to free list
2776 */
2777 if (extpages > minctg) {
2778 for (i = 0; i < extra; i++) {
2779 pp = pp_first->p_prev;
2780 page_sub(&pp_first, pp);
2781 page_io_unlock(pp);
2782 page_unresv(1);
2783 page_free(pp, 1);
2784 }
2785 }
2786 return (pp_first);
2787 balloon_fail:
2788 /*
2789 * Return pages to free list and return failure
2790 */
2791 while (pp_first != NULL) {
2792 pp = pp_first;
2793 page_sub(&pp_first, pp);
2794 page_io_unlock(pp);
2795 if (pp->p_vnode != NULL)
2796 page_hashout(pp, NULL);
2797 page_free(pp, 1);
2798 }
2799 if (pplist)
2800 kmem_free(pplist, extpages * sizeof (page_t *));
2801 if (mfnlist)
2802 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2803 page_unresv(extpages - minctg);
2804 return (NULL);
2805 }
2806
2807 static void
2808 return_partial_alloc(page_t *plist)
2809 {
2810 page_t *pp;
2811
2812 while (plist != NULL) {
2813 pp = plist;
2814 page_sub(&plist, pp);
2815 page_io_unlock(pp);
2816 page_destroy_io(pp);
2817 }
2818 }
2819
2820 static page_t *
2821 page_get_contigpages(
2822 struct vnode *vp,
2823 u_offset_t off,
2824 int *npagesp,
2825 uint_t flags,
2826 caddr_t vaddr,
2827 ddi_dma_attr_t *mattr)
2828 {
2829 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2830 page_t *plist; /* list to return */
2831 page_t *pp, *mcpl;
2832 int contig, anyaddr, npages, getone = 0;
2833 mfn_t lo_mfn;
2834 mfn_t hi_mfn;
2835 pgcnt_t pfnalign = 0;
2836 int align, sgllen;
2837 uint64_t pfnseg;
2838 pgcnt_t minctg;
2839
2840 npages = *npagesp;
2841 ASSERT(mattr != NULL);
2842 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2843 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2844 sgllen = mattr->dma_attr_sgllen;
2845 pfnseg = mmu_btop(mattr->dma_attr_seg);
2846 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2847 if (align > MMU_PAGESIZE)
2848 pfnalign = mmu_btop(align);
2849
2850 contig = flags & PG_PHYSCONTIG;
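	/*
	 * npages == -1 is the convention used by page_create_io() below to
	 * ask for a single page with no alignment constraint.
	 */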
2851 if (npages == -1) {
2852 npages = 1;
2853 pfnalign = 0;
2854 }
2855 /*
2856 * Clear the contig flag if only one page is needed.
2857 */
2858 if (npages == 1) {
2859 getone = 1;
2860 contig = 0;
2861 }
2862
2863 /*
2864 * Check whether any page in the system satisfies the address constraints.
2865 */
2866 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2867 if (!contig && anyaddr && !pfnalign) {
2868 flags &= ~PG_PHYSCONTIG;
2869 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2870 flags, &kvseg, vaddr);
2871 if (plist != NULL) {
2872 *npagesp = 0;
2873 return (plist);
2874 }
2875 }
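	/*
	 * Carve the request into runs of at most minctg pages, one run per
	 * scatter/gather list entry.  For example, npages == 10 with
	 * sgllen == 4 gives minctg == howmany(10, 4) == 3.
	 */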
2876 plist = NULL;
2877 minctg = howmany(npages, sgllen);
2878 while (npages > sgllen || getone) {
2879 if (minctg > npages)
2880 minctg = npages;
2881 mcpl = NULL;
2882 /*
2883 * We could want contig pages with no address range limits.
2884 */
2885 if (anyaddr && contig) {
2886 /*
2887 * Look for free contig pages to satisfy the request.
2888 */
2889 mcpl = find_contig_free(minctg, flags, pfnseg,
2890 pfnalign);
2891 }
2892 /*
2893 * Try the reserved io pools next
2894 */
2895 if (mcpl == NULL)
2896 mcpl = page_io_pool_alloc(mattr, contig, minctg);
2897 if (mcpl != NULL) {
2898 pp = mcpl;
2899 do {
2900 if (!page_hashin(pp, vp, off, NULL)) {
2901 panic("page_get_contigpages:"
2902 " hashin failed"
2903 " pp %p, vp %p, off %llx",
2904 (void *)pp, (void *)vp, off);
2905 }
2906 off += MMU_PAGESIZE;
2907 PP_CLRFREE(pp);
2908 PP_CLRAGED(pp);
2909 page_set_props(pp, P_REF);
2910 page_io_lock(pp);
2911 pp = pp->p_next;
2912 } while (pp != mcpl);
2913 } else {
2914 /*
2915 * Hypervisor exchange doesn't handle segment or
2916 * alignment constraints
2917 */
2918 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
2919 pfnalign)
2920 goto fail;
2921 /*
2922 * Try exchanging pages with the hypervisor
2923 */
2924 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
2925 flags, minctg);
2926 if (mcpl == NULL)
2927 goto fail;
2928 off += minctg * MMU_PAGESIZE;
2929 }
2930 check_dma(mattr, mcpl, minctg);
2931 /*
2932 * Here with a minctg run of contiguous pages, add them to the
2933 * list we will return for this request.
2934 */
2935 page_list_concat(&plist, &mcpl);
2936 npages -= minctg;
2937 *npagesp = npages;
2938 sgllen--;
2939 if (getone)
2940 break;
2941 }
2942 return (plist);
2943 fail:
2944 return_partial_alloc(plist);
2945 return (NULL);
2946 }
2947
2948 /*
2949 * Allocator for domain 0 I/O pages. We match the required
2950 * DMA attributes and contiguity constraints.
2951 */
2952 /*ARGSUSED*/
2953 page_t *
2954 page_create_io(
2955 struct vnode *vp,
2956 u_offset_t off,
2957 uint_t bytes,
2958 uint_t flags,
2959 struct as *as,
2960 caddr_t vaddr,
2961 ddi_dma_attr_t *mattr)
2962 {
2963 page_t *plist = NULL, *pp;
2964 int npages = 0, contig, anyaddr, pages_req;
2965 mfn_t lo_mfn;
2966 mfn_t hi_mfn;
2967 pgcnt_t pfnalign = 0;
2968 int align;
2969 int is_domu = 0;
2970 int dummy, bytes_got;
2971 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2972
2973 ASSERT(mattr != NULL);
2974 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2975 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2976 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2977 if (align > MMU_PAGESIZE)
2978 pfnalign = mmu_btop(align);
2979
2980 /*
2981 * Clear the contig flag if only one page is needed or the scatter
2982 * gather list length is >= npages.
2983 */
2984 pages_req = npages = mmu_btopr(bytes);
2985 contig = (flags & PG_PHYSCONTIG);
2986 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
2987 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
2988 contig = 0;
2989
2990 /*
2991 * Check whether any page in the system satisfies the constraints.
2992 * DomU should always go down this path.
2993 */
2994 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
2995 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
2996 if ((!contig && anyaddr) || is_domu) {
2997 flags &= ~PG_PHYSCONTIG;
2998 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
2999 if (plist != NULL)
3000 return (plist);
3001 else if (is_domu)
3002 return (NULL); /* no memory available */
3003 }
3004 /*
3005 * DomU should never reach here
3006 */
3007 if (contig) {
3008 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
3009 mattr);
3010 if (plist == NULL)
3011 goto fail;
3012 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
3013 vaddr += bytes_got;
3014 off += bytes_got;
3015 /*
3016 * We now have all the contiguous pages we need, but
3017 * we may still need additional non-contiguous pages.
3018 */
3019 }
3020 /*
3021 * Now loop, collecting the requested number of pages.  These do
3022 * not have to be contiguous pages, but we still use the contig
3023 * page alloc code to get them since it will honor any
3024 * other constraints the pages may have.
3025 */
3026 while (npages--) {
3027 dummy = -1;
3028 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3029 if (pp == NULL)
3030 goto fail;
3031 page_add(&plist, pp);
3032 vaddr += MMU_PAGESIZE;
3033 off += MMU_PAGESIZE;
3034 }
3035 return (plist);
3036 fail:
3037 /*
3038 * Failed to get enough pages; give back the ones we did get.
3039 */
3040 return_partial_alloc(plist);
3041 return (NULL);
3042 }
3043
3044 /*
3045 * Lock and return the page with the highest mfn that we can find. last_mfn
3046 * holds the last one found, so the next search can start from there. We
3047 * also keep a counter so that we don't loop forever if the machine has no
3048 * free pages.
3049 *
3050 * This is called from the balloon thread to find pages to give away. new_high
3051 * is used when new mfns have been added to the system - we will reset our
3052 * search if the new mfns are higher than our current search position.
3053 */
3054 page_t *
3055 page_get_high_mfn(mfn_t new_high)
3056 {
3057 static mfn_t last_mfn = 0;
3058 pfn_t pfn;
3059 page_t *pp;
3060 ulong_t loop_count = 0;
3061
3062 if (new_high > last_mfn)
3063 last_mfn = new_high;
3064
3065 for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3066 if (last_mfn == 0) {
3067 last_mfn = cached_max_mfn;
3068 }
3069
3070 pfn = mfn_to_pfn(last_mfn);
3071 if (pfn & PFN_IS_FOREIGN_MFN)
3072 continue;
3073
3074 /* See if the page is free. If so, lock it. */
3075 pp = page_numtopp_alloc(pfn);
3076 if (pp == NULL)
3077 continue;
3078 PP_CLRFREE(pp);
3079
3080 ASSERT(PAGE_EXCL(pp));
3081 ASSERT(pp->p_vnode == NULL);
3082 ASSERT(!hat_page_is_mapped(pp));
3083 last_mfn--;
3084 return (pp);
3085 }
3086 return (NULL);
3087 }
3088
3089 #else /* !__xpv */
3090
3091 /*
3092 * Get a page from any list (freelist or cachelist) within the given mnode.
3093 */
3094 static page_t *
3095 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3096 int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3097 {
3098 kmutex_t *pcm;
3099 int i;
3100 page_t *pp;
3101 page_t *first_pp;
3102 uint64_t pgaddr;
3103 ulong_t bin;
3104 int mtypestart;
3105 int plw_initialized;
3106 page_list_walker_t plw;
3107
3108 VM_STAT_ADD(pga_vmstats.pgma_alloc);
3109
3110 ASSERT((flags & PG_MATCH_COLOR) == 0);
3111 ASSERT(szc == 0);
3112 ASSERT(dma_attr != NULL);
3113
3114 MTYPE_START(mnode, mtype, flags);
3115 if (mtype < 0) {
3116 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3117 return (NULL);
3118 }
3119
3120 mtypestart = mtype;
3121
3122 bin = origbin;
3123
3124 /*
3125 * check up to page_colors + 1 bins - origbin may be checked twice
3126 * because of BIN_STEP skip
3127 */
3128 do {
3129 plw_initialized = 0;
3130
3131 for (plw.plw_count = 0;
3132 plw.plw_count < page_colors; plw.plw_count++) {
3133
3134 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3135 goto nextfreebin;
3136
3137 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3138 mutex_enter(pcm);
3139 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
3140 first_pp = pp;
3141 while (pp != NULL) {
3142 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3143 SE_EXCL) == 0) {
3144 pp = pp->p_next;
3145 if (pp == first_pp) {
3146 pp = NULL;
3147 }
3148 continue;
3149 }
3150
3151 ASSERT(PP_ISFREE(pp));
3152 ASSERT(PP_ISAGED(pp));
3153 ASSERT(pp->p_vnode == NULL);
3154 ASSERT(pp->p_hash == NULL);
3155 ASSERT(pp->p_offset == (u_offset_t)-1);
3156 ASSERT(pp->p_szc == szc);
3157 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3158 /* check if page within DMA attributes */
3159 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3160 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3161 (pgaddr + MMU_PAGESIZE - 1 <=
3162 dma_attr->dma_attr_addr_hi)) {
3163 break;
3164 }
3165
3166 /* continue looking */
3167 page_unlock(pp);
3168 pp = pp->p_next;
3169 if (pp == first_pp)
3170 pp = NULL;
3171
3172 }
3173 if (pp != NULL) {
3174 ASSERT(mtype == PP_2_MTYPE(pp));
3175 ASSERT(pp->p_szc == 0);
3176
3177 /* found a page with specified DMA attributes */
3178 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3179 mtype), pp);
3180 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3181
3182 if ((PP_ISFREE(pp) == 0) ||
3183 (PP_ISAGED(pp) == 0)) {
3184 cmn_err(CE_PANIC, "page %p is not free",
3185 (void *)pp);
3186 }
3187
3188 mutex_exit(pcm);
3189 check_dma(dma_attr, pp, 1);
3190 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3191 return (pp);
3192 }
3193 mutex_exit(pcm);
3194 nextfreebin:
3195 if (plw_initialized == 0) {
3196 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3197 ASSERT(plw.plw_ceq_dif == page_colors);
3198 plw_initialized = 1;
3199 }
3200
3201 if (plw.plw_do_split) {
3202 pp = page_freelist_split(szc, bin, mnode,
3203 mtype,
3204 mmu_btop(dma_attr->dma_attr_addr_lo),
3205 mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3206 &plw);
3207 if (pp != NULL) {
3208 check_dma(dma_attr, pp, 1);
3209 return (pp);
3210 }
3211 }
3212
3213 bin = page_list_walk_next_bin(szc, bin, &plw);
3214 }
3215
3216 MTYPE_NEXT(mnode, mtype, flags);
3217 } while (mtype >= 0);
3218
3219 /* failed to find a page in the freelist; try it in the cachelist */
3220
3221 /* reset mtype start for cachelist search */
3222 mtype = mtypestart;
3223 ASSERT(mtype >= 0);
3224
3225 /* start with the bin of matching color */
3226 bin = origbin;
3227
3228 do {
3229 for (i = 0; i <= page_colors; i++) {
3230 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3231 goto nextcachebin;
3232 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3233 mutex_enter(pcm);
3234 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3235 first_pp = pp;
3236 while (pp != NULL) {
3237 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3238 SE_EXCL) == 0) {
3239 pp = pp->p_next;
3240 if (pp == first_pp)
3241 pp = NULL;
3242 continue;
3243 }
3244 ASSERT(pp->p_vnode);
3245 ASSERT(PP_ISAGED(pp) == 0);
3246 ASSERT(pp->p_szc == 0);
3247 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3248
3249 /* check if page within DMA attributes */
3250
3251 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3252 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3253 (pgaddr + MMU_PAGESIZE - 1 <=
3254 dma_attr->dma_attr_addr_hi)) {
3255 break;
3256 }
3257
3258 /* continue looking */
3259 page_unlock(pp);
3260 pp = pp->p_next;
3261 if (pp == first_pp)
3262 pp = NULL;
3263 }
3264
3265 if (pp != NULL) {
3266 ASSERT(mtype == PP_2_MTYPE(pp));
3267 ASSERT(pp->p_szc == 0);
3268
3269 /* found a page with specified DMA attributes */
3270 page_sub(&PAGE_CACHELISTS(mnode, bin,
3271 mtype), pp);
3272 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3273
3274 mutex_exit(pcm);
3275 ASSERT(pp->p_vnode);
3276 ASSERT(PP_ISAGED(pp) == 0);
3277 check_dma(dma_attr, pp, 1);
3278 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3279 return (pp);
3280 }
3281 mutex_exit(pcm);
3282 nextcachebin:
3283 bin += (i == 0) ? BIN_STEP : 1;
3284 bin &= page_colors_mask;
3285 }
3286 MTYPE_NEXT(mnode, mtype, flags);
3287 } while (mtype >= 0);
3288
3289 VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3290 return (NULL);
3291 }
3292
3293 /*
3294 * This function is similar to page_get_freelist()/page_get_cachelist()
3295 * but it searches both the lists to find a page with the specified
3296 * color (or no color) and DMA attributes. The search is done in the
3297 * freelist first and then in the cache list within the highest memory
3298 * range (based on DMA attributes) before searching in the lower
3299 * memory ranges.
3300 *
3301 * Note: This function is called only by page_create_io().
3302 */
3303 /*ARGSUSED*/
3304 static page_t *
3305 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3306 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3307 {
3308 uint_t bin;
3309 int mtype;
3310 page_t *pp;
3311 int n;
3312 int m;
3313 int szc;
3314 int fullrange;
3315 int mnode;
3316 int local_failed_stat = 0;
3317 lgrp_mnode_cookie_t lgrp_cookie;
3318
3319 VM_STAT_ADD(pga_vmstats.pga_alloc);
3320
3321 /* only base pagesize currently supported */
3322 if (size != MMU_PAGESIZE)
3323 return (NULL);
3324
3325 /*
3326 * If we're passed a specific lgroup, we use it. Otherwise,
3327 * assume first-touch placement is desired.
3328 */
3329 if (!LGRP_EXISTS(lgrp))
3330 lgrp = lgrp_home_lgrp();
3331
3332 /* LINTED */
3333 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3334
3335 /*
3336 * Only hold one freelist or cachelist lock at a time; that way we
3337 * can start anywhere and not have to worry about lock
3338 * ordering.
3339 */
3340 if (dma_attr == NULL) {
3341 n = mtype16m;
3342 m = mtypetop;
3343 fullrange = 1;
3344 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3345 } else {
3346 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3347 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3348
3349 /*
3350 * We can only guarantee alignment to a page boundary.
3351 */
3352 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3353 return (NULL);
3354
3355 /* Sanity check the dma_attr */
3356 if (pfnlo > pfnhi)
3357 return (NULL);
3358
3359 n = pfn_2_mtype(pfnlo);
3360 m = pfn_2_mtype(pfnhi);
3361
3362 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3363 (pfnhi >= mnoderanges[m].mnr_pfnhi));
3364 }
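	/*
	 * fullrange is nonzero when [pfnlo, pfnhi] completely covers the
	 * mtype ranges n..m; in that case the regular freelist/cachelist
	 * getters can be used without per-page address filtering.
	 */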
3365 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3366
3367 szc = 0;
3368
3369 /* cycling through mtypes is handled by RANGE0 if n == mtype16m */
3370 if (n == mtype16m) {
3371 flags |= PGI_MT_RANGE0;
3372 n = m;
3373 }
3374
3375 /*
3376 * Try local memory node first, but try remote if we can't
3377 * get a page of the right color.
3378 */
3379 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3380 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3381 /*
3382 * allocate pages from high pfn to low.
3383 */
3384 mtype = m;
3385 do {
3386 if (fullrange != 0) {
3387 pp = page_get_mnode_freelist(mnode,
3388 bin, mtype, szc, flags);
3389 if (pp == NULL) {
3390 pp = page_get_mnode_cachelist(
3391 bin, flags, mnode, mtype);
3392 }
3393 } else {
3394 pp = page_get_mnode_anylist(bin, szc,
3395 flags, mnode, mtype, dma_attr);
3396 }
3397 if (pp != NULL) {
3398 VM_STAT_ADD(pga_vmstats.pga_allocok);
3399 check_dma(dma_attr, pp, 1);
3400 return (pp);
3401 }
3402 } while (mtype != n &&
3403 (mtype = mnoderanges[mtype].mnr_next) != -1);
3404 if (!local_failed_stat) {
3405 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3406 local_failed_stat = 1;
3407 }
3408 }
3409 VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3410
3411 return (NULL);
3412 }
3413
3414 /*
3415 * page_create_io()
3416 *
3417 * This function is a copy of page_create_va() with an additional
3418 * argument 'mattr' that specifies DMA memory requirements to
3419 * the page list functions. This function is used by the segkmem
3420 * allocator, so it is only used to create new pages (i.e., PG_EXCL
3421 * is set).
3422 *
3423 * Note: This interface is currently used by x86 PSM only and is
3424 * not fully specified so the commitment level is only for
3425 * private interface specific to x86. This interface uses PSM
3426 * specific page_get_anylist() interface.
3427 */
3428
3429 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3430 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3431 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3432 break; \
3433 } \
3434 }
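/*
 * PAGE_HASH_SEARCH() is used below, under the page hash mutex, to check
 * whether a page for (vp, off) already exists before the newly allocated
 * page is inserted.
 */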
3435
3436
3437 page_t *
3438 page_create_io(
3439 struct vnode *vp,
3440 u_offset_t off,
3441 uint_t bytes,
3442 uint_t flags,
3443 struct as *as,
3444 caddr_t vaddr,
3445 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */
3446 {
3447 page_t *plist = NULL;
3448 uint_t plist_len = 0;
3449 pgcnt_t npages;
3450 page_t *npp = NULL;
3451 uint_t pages_req;
3452 page_t *pp;
3453 kmutex_t *phm = NULL;
3454 uint_t index;
3455
3456 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3457 "page_create_start:vp %p off %llx bytes %u flags %x",
3458 vp, off, bytes, flags);
3459
3460 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3461
3462 pages_req = npages = mmu_btopr(bytes);
3463
3464 /*
3465 * Do the freemem and pcf accounting.
3466 */
3467 if (!page_create_wait(npages, flags)) {
3468 return (NULL);
3469 }
3470
3471 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3472 "page_create_success:vp %p off %llx", vp, off);
3473
3474 /*
3475 * If satisfying this request has left us with too little
3476 * memory, start the wheels turning to get some back. The
3477 * first clause of the test prevents waking up the pageout
3478 * daemon in situations where it would decide that there's
3479 * nothing to do.
3480 */
3481 if (nscan < desscan && freemem < minfree) {
3482 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3483 "pageout_cv_signal:freemem %ld", freemem);
3484 cv_signal(&proc_pageout->p_cv);
3485 }
3486
3487 if (flags & PG_PHYSCONTIG) {
3488
3489 plist = page_get_contigpage(&npages, mattr, 1);
3490 if (plist == NULL) {
3491 page_create_putback(npages);
3492 return (NULL);
3493 }
3494
3495 pp = plist;
3496
3497 do {
3498 if (!page_hashin(pp, vp, off, NULL)) {
3499 panic("pg_creat_io: hashin failed %p %p %llx",
3500 (void *)pp, (void *)vp, off);
3501 }
3502 VM_STAT_ADD(page_create_new);
3503 off += MMU_PAGESIZE;
3504 PP_CLRFREE(pp);
3505 PP_CLRAGED(pp);
3506 page_set_props(pp, P_REF);
3507 pp = pp->p_next;
3508 } while (pp != plist);
3509
3510 if (!npages) {
3511 check_dma(mattr, plist, pages_req);
3512 return (plist);
3513 } else {
3514 vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3515 }
3516
3517 /*
3518 * fall-thru:
3519 *
3520 * page_get_contigpage returns when npages <= sgllen.
3521 * Grab the rest of the non-contig pages below from anylist.
3522 */
3523 }
3524
3525 /*
3526 * Loop around collecting the requested number of pages.
3527 * Most of the time, we have to `create' a new page. With
3528 * this in mind, pull the page off the free list before
3529 * getting the hash lock. This will minimize the hash
3530 * lock hold time, nesting, and the like. If it turns
3531 * out we don't need the page, we put it back at the end.
3532 */
3533 while (npages--) {
3534 phm = NULL;
3535
3536 index = PAGE_HASH_FUNC(vp, off);
3537 top:
3538 ASSERT(phm == NULL);
3539 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3540 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3541
3542 if (npp == NULL) {
3543 /*
3544 * Try to get the page of any color either from
3545 * the freelist or from the cache list.
3546 */
3547 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3548 flags & ~PG_MATCH_COLOR, mattr, NULL);
3549 if (npp == NULL) {
3550 if (mattr == NULL) {
3551 /*
3552 * Not looking for a special page;
3553 * panic!
3554 */
3555 panic("no page found %d", (int)npages);
3556 }
3557 /*
3558 * No page found! This can happen
3559 * if we are looking for a page
3560 * within a specific memory range
3561 * for DMA purposes. If PG_WAIT is
3562 * specified then we wait for a
3563 * while and then try again. The
3564 * wait could be forever if we
3565 * don't get the page(s) we need.
3566 *
3567 * Note: XXX We really need a mechanism
3568 * to wait for pages in the desired
3569 * range. For now, we wait for any
3570 * pages and see if we can use them.
3571 */
3572
3573 if ((mattr != NULL) && (flags & PG_WAIT)) {
3574 delay(10);
3575 goto top;
3576 }
3577 goto fail; /* undo accounting stuff */
3578 }
3579
3580 if (PP_ISAGED(npp) == 0) {
3581 /*
3582 * Since this page came from the
3583 * cachelist, we must destroy the
3584 * old vnode association.
3585 */
3586 page_hashout(npp, (kmutex_t *)NULL);
3587 }
3588 }
3589
3590 /*
3591 * We own this page!
3592 */
3593 ASSERT(PAGE_EXCL(npp));
3594 ASSERT(npp->p_vnode == NULL);
3595 ASSERT(!hat_page_is_mapped(npp));
3596 PP_CLRFREE(npp);
3597 PP_CLRAGED(npp);
3598
3599 /*
3600 * Here we have a page in our hot little mitts and are
3601 * just waiting to stuff it on the appropriate lists.
3602 * Get the mutex and check to see if it really does
3603 * not exist.
3604 */
3605 phm = PAGE_HASH_MUTEX(index);
3606 mutex_enter(phm);
3607 PAGE_HASH_SEARCH(index, pp, vp, off);
3608 if (pp == NULL) {
3609 VM_STAT_ADD(page_create_new);
3610 pp = npp;
3611 npp = NULL;
3612 if (!page_hashin(pp, vp, off, phm)) {
3613 /*
3614 * Since we hold the page hash mutex and
3615 * just searched for this page, page_hashin
3616 * had better not fail. If it does, that
3617 * means some thread did not follow the
3618 * page hash mutex rules. Panic now and
3619 * get it over with. As usual, go down
3620 * holding all the locks.
3621 */
3622 ASSERT(MUTEX_HELD(phm));
3623 panic("page_create: hashin fail %p %p %llx %p",
3624 (void *)pp, (void *)vp, off, (void *)phm);
3625
3626 }
3627 ASSERT(MUTEX_HELD(phm));
3628 mutex_exit(phm);
3629 phm = NULL;
3630
3631 /*
3632 * Hat layer locking need not be done to set
3633 * the following bits since the page is not hashed
3634 * and was on the free list (i.e., had no mappings).
3635 *
3636 * Set the reference bit to protect
3637 * against immediate pageout
3638 *
3639 * XXXmh modify freelist code to set reference
3640 * bit so we don't have to do it here.
3641 */
3642 page_set_props(pp, P_REF);
3643 } else {
3644 ASSERT(MUTEX_HELD(phm));
3645 mutex_exit(phm);
3646 phm = NULL;
3647 /*
3648 * NOTE: This should not happen for pages associated
3649 * with kernel vnode 'kvp'.
3650 */
3651 /* XX64 - to debug why this happens! */
3652 ASSERT(!VN_ISKAS(vp));
3653 if (VN_ISKAS(vp))
3654 cmn_err(CE_NOTE,
3655 "page_create: page not expected "
3656 "in hash list for kernel vnode - pp 0x%p",
3657 (void *)pp);
3658 VM_STAT_ADD(page_create_exists);
3659 goto fail;
3660 }
3661
3662 /*
3663 * Got a page! It is locked. Acquire the i/o
3664 * lock since we are going to use the p_next and
3665 * p_prev fields to link the requested pages together.
3666 */
3667 page_io_lock(pp);
3668 page_add(&plist, pp);
3669 plist = plist->p_next;
3670 off += MMU_PAGESIZE;
3671 vaddr += MMU_PAGESIZE;
3672 }
3673
3674 check_dma(mattr, plist, pages_req);
3675 return (plist);
3676
3677 fail:
3678 if (npp != NULL) {
3679 /*
3680 * Did not need this page after all.
3681 * Put it back on the free list.
3682 */
3683 VM_STAT_ADD(page_create_putbacks);
3684 PP_SETFREE(npp);
3685 PP_SETAGED(npp);
3686 npp->p_offset = (u_offset_t)-1;
3687 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3688 page_unlock(npp);
3689 }
3690
3691 /*
3692 * Give up the pages we already got.
3693 */
3694 while (plist != NULL) {
3695 pp = plist;
3696 page_sub(&plist, pp);
3697 page_io_unlock(pp);
3698 plist_len++;
3699 /*LINTED: constant in conditional ctx*/
3700 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3701 }
3702
3703 /*
3704 * VN_DISPOSE does freemem accounting for the pages in plist
3705 * by calling page_free. So, we need to undo the pcf accounting
3706 * for only the remaining pages.
3707 */
3708 VM_STAT_ADD(page_create_putbacks);
3709 page_create_putback(pages_req - plist_len);
3710
3711 return (NULL);
3712 }
3713 #endif /* !__xpv */
3714
3715
3716 /*
3717 * Copy the data from the physical page represented by "frompp" to
3718 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3719 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
3720 * level and no one sleeps with an active mapping there.
3721 *
3722 * Note that the ref/mod bits in the page_t's are not affected by
3723 * this operation, hence it is up to the caller to update them appropriately.
3724 */
3725 int
3726 ppcopy(page_t *frompp, page_t *topp)
3727 {
3728 caddr_t pp_addr1;
3729 caddr_t pp_addr2;
3730 hat_mempte_t pte1;
3731 hat_mempte_t pte2;
3732 kmutex_t *ppaddr_mutex;
3733 label_t ljb;
3734 int ret = 1;
3735
3736 ASSERT_STACK_ALIGNED();
3737 ASSERT(PAGE_LOCKED(frompp));
3738 ASSERT(PAGE_LOCKED(topp));
3739
3740 if (kpm_enable) {
3741 pp_addr1 = hat_kpm_page2va(frompp, 0);
3742 pp_addr2 = hat_kpm_page2va(topp, 0);
3743 kpreempt_disable();
3744 } else {
3745 /*
3746 * Disable preemption so that the CPU can't change underneath us.
3747 */
3748 kpreempt_disable();
3749
3750 pp_addr1 = CPU->cpu_caddr1;
3751 pp_addr2 = CPU->cpu_caddr2;
3752 pte1 = CPU->cpu_caddr1pte;
3753 pte2 = CPU->cpu_caddr2pte;
3754
3755 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3756 mutex_enter(ppaddr_mutex);
3757
3758 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3759 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3760 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3761 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3762 HAT_LOAD_NOCONSIST);
3763 }
3764
3765 if (on_fault(&ljb)) {
3766 ret = 0;
3767 goto faulted;
3768 }
3769 if (use_sse_pagecopy)
3770 #ifdef __xpv
3771 page_copy_no_xmm(pp_addr2, pp_addr1);
3772 #else
3773 hwblkpagecopy(pp_addr1, pp_addr2);
3774 #endif
3775 else
3776 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3777
3778 no_fault();
3779 faulted:
3780 if (!kpm_enable) {
3781 #ifdef __xpv
3782 /*
3783 * We can't leave unused mappings laying about under the
3784 * hypervisor, so blow them away.
3785 */
3786 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3787 UVMF_INVLPG | UVMF_LOCAL) < 0)
3788 panic("HYPERVISOR_update_va_mapping() failed");
3789 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3790 UVMF_INVLPG | UVMF_LOCAL) < 0)
3791 panic("HYPERVISOR_update_va_mapping() failed");
3792 #endif
3793 mutex_exit(ppaddr_mutex);
3794 }
3795 kpreempt_enable();
3796 return (ret);
3797 }
3798
3799 void
3800 pagezero(page_t *pp, uint_t off, uint_t len)
3801 {
3802 ASSERT(PAGE_LOCKED(pp));
3803 pfnzero(page_pptonum(pp), off, len);
3804 }
3805
3806 /*
3807 * Zero the physical page from off to off + len given by pfn
3808 * without changing the reference and modified bits of page.
3809 *
3810 * We do this using CPU private page address #2; see ppcopy() for more info.
3811 * pfnzero() must not be called at interrupt level.
3812 */
3813 void
3814 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3815 {
3816 caddr_t pp_addr2;
3817 hat_mempte_t pte2;
3818 kmutex_t *ppaddr_mutex = NULL;
3819
3820 ASSERT_STACK_ALIGNED();
3821 ASSERT(len <= MMU_PAGESIZE);
3822 ASSERT(off <= MMU_PAGESIZE);
3823 ASSERT(off + len <= MMU_PAGESIZE);
3824
3825 if (kpm_enable && !pfn_is_foreign(pfn)) {
3826 pp_addr2 = hat_kpm_pfn2va(pfn);
3827 kpreempt_disable();
3828 } else {
3829 kpreempt_disable();
3830
3831 pp_addr2 = CPU->cpu_caddr2;
3832 pte2 = CPU->cpu_caddr2pte;
3833
3834 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3835 mutex_enter(ppaddr_mutex);
3836
3837 hat_mempte_remap(pfn, pp_addr2, pte2,
3838 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3839 HAT_LOAD_NOCONSIST);
3840 }
3841
3842 if (use_sse_pagezero) {
3843 #ifdef __xpv
3844 uint_t rem;
3845
3846 /*
3847 * zero a byte at a time until properly aligned for
3848 * block_zero_no_xmm().
3849 */
3850 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3851 pp_addr2[off++] = 0;
3852
3853 /*
3854 * Now use faster block_zero_no_xmm() for any range
3855 * that is properly aligned and sized.
3856 */
3857 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3858 len -= rem;
3859 if (len != 0) {
3860 block_zero_no_xmm(pp_addr2 + off, len);
3861 off += len;
3862 }
3863
3864 /*
3865 * zero remainder with byte stores.
3866 */
3867 while (rem-- > 0)
3868 pp_addr2[off++] = 0;
3869 #else
3870 hwblkclr(pp_addr2 + off, len);
3871 #endif
3872 } else {
3873 bzero(pp_addr2 + off, len);
3874 }
3875
3876 if (!kpm_enable || pfn_is_foreign(pfn)) {
3877 #ifdef __xpv
3878 /*
3879 * On the hypervisor this page might get used for a page
3880 * table before any intervening change to this mapping,
3881 * so blow it away.
3882 */
3883 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3884 UVMF_INVLPG) < 0)
3885 panic("HYPERVISOR_update_va_mapping() failed");
3886 #endif
3887 mutex_exit(ppaddr_mutex);
3888 }
3889
3890 kpreempt_enable();
3891 }
3892
3893 /*
3894 * Platform-dependent page scrub call.
3895 */
3896 void
3897 pagescrub(page_t *pp, uint_t off, uint_t len)
3898 {
3899 /*
3900 * For now, we rely on the fact that pagezero() will
3901 * always clear UEs.
3902 */
3903 pagezero(pp, off, len);
3904 }
3905
3906 /*
3907 * Set up two private mapping addresses on a given CPU for use in ppcopy().
3908 */
3909 void
3910 setup_vaddr_for_ppcopy(struct cpu *cpup)
3911 {
3912 void *addr;
3913 hat_mempte_t pte_pa;
3914
3915 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3916 pte_pa = hat_mempte_setup(addr);
3917 cpup->cpu_caddr1 = addr;
3918 cpup->cpu_caddr1pte = pte_pa;
3919
3920 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3921 pte_pa = hat_mempte_setup(addr);
3922 cpup->cpu_caddr2 = addr;
3923 cpup->cpu_caddr2pte = pte_pa;
3924
3925 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
3926 }
3927
3928 /*
3929 * Undo setup_vaddr_for_ppcopy
3930 */
3931 void
3932 teardown_vaddr_for_ppcopy(struct cpu *cpup)
3933 {
3934 mutex_destroy(&cpup->cpu_ppaddr_mutex);
3935
3936 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3937 cpup->cpu_caddr2pte = 0;
3938 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3939 cpup->cpu_caddr2 = 0;
3940
3941 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3942 cpup->cpu_caddr1pte = 0;
3943 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3944 cpup->cpu_caddr1 = 0;
3945 }
3946
3947 /*
3948 * Function for flushing D-cache when performing module relocations
3949 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
3950 */
3951 void
3952 dcache_flushall()
3953 {}
3954
3955 /*
3956 * Allocate a memory page. The argument 'seed' can be any pseudo-random
3957 * number to vary where the pages come from. This is quite a hacked up
3958 * method -- it works for now, but really needs to be fixed up a bit.
3959 *
3960 * We currently use page_create_va() on the kvp with fake offsets,
3961 * segments and virt address. This is pretty bogus, but was copied from the
3962 * old hat_i86.c code. A better approach would be to specify either mnode
3963 * random or mnode local and take a page from whatever color has the MOST
3964 * available - this would have a minimal impact on page coloring.
3965 */
3966 page_t *
3967 page_get_physical(uintptr_t seed)
3968 {
3969 page_t *pp;
3970 u_offset_t offset;
3971 static struct seg tmpseg;
3972 static uintptr_t ctr = 0;
3973
3974 /*
3975 * This code is gross; we really need a simpler page allocator.
3976 *
3977 * We need to assign an offset for the page in order to call page_create_va().
3978 * To avoid conflicts with other pages, we get creative with the offset:
3979 * for 32 bits we need an offset > 4Gig, and
3980 * for 64 bits we need an offset somewhere in the VA hole.
3981 */
3982 offset = seed;
3983 if (offset > kernelbase)
3984 offset -= kernelbase;
3985 offset <<= MMU_PAGESHIFT;
3986 #if defined(__amd64)
3987 offset += mmu.hole_start; /* something in VA hole */
3988 #else
3989 offset += 1ULL << 40; /* something > 4 Gig */
3990 #endif
3991
3992 if (page_resv(1, KM_NOSLEEP) == 0)
3993 return (NULL);
3994
3995 #ifdef DEBUG
3996 pp = page_exists(&kvp, offset);
3997 if (pp != NULL)
3998 panic("page already exists %p", (void *)pp);
3999 #endif
4000
4001 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
4002 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */
4003 if (pp != NULL) {
4004 page_io_unlock(pp);
4005 page_downgrade(pp);
4006 }
4007 return (pp);
4008 }