1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * UNIX machine dependent virtual memory support.
28 */
29
30 #include <sys/vm.h>
31 #include <sys/exec.h>
32
33 #include <sys/exechdr.h>
34 #include <vm/seg_kmem.h>
35 #include <sys/atomic.h>
36 #include <sys/archsystm.h>
37 #include <sys/machsystm.h>
38 #include <sys/kdi.h>
39 #include <sys/cpu_module.h>
40 #include <sys/secflags.h>
41
42 #include <vm/hat_sfmmu.h>
43
44 #include <sys/memnode.h>
45
46 #include <sys/mem_config.h>
47 #include <sys/mem_cage.h>
48 #include <vm/vm_dep.h>
49 #include <vm/page.h>
50 #include <sys/platform_module.h>
51
/*
 * This variable is set by module specific config routines.
 * It is only set by modules which will use physical cache page coloring.
 * While it is 0, page_coloring_init() gives every page size exactly one
 * color and returns early.
 */
int do_pg_coloring = 0;

/*
 * This variable can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init():
 *
 *	page_colors		ecache_setsize / MMU_PAGESIZE
 *	page_colors_mask	page_colors - 1
 *	page_coloring_shift	number of times ecache_setsize can be shifted
 *				right by one before it becomes zero, less one
 *	cpu_page_colors		cpu_setsize / MMU_PAGESIZE, but only when
 *				cpu_setsize is positive, smaller than
 *				ecache_setsize, and this variable is still 0
 *	vac_colors		vac_size / MMU_PAGESIZE
 *	vac_colors_mask		vac_colors - 1
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * cpu specific coloring initialization.  The symbol is weak, so
 * page_coloring_init() tests its address before calling it.
 */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t plcnt;		/* page list count */
86
/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 *
 * pagefault() and valid_usr_range() compare addresses against it to refuse
 * execute access below the limit in 64-bit address spaces.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

/* Defined elsewhere; only declared here. */
extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 *
 * map_addr_proc() is what map_addr() forwards to once it has chosen the
 * user limit.
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
    struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    caddr_t, size_t, uint_t, struct lgrp *);
105 /*
106 * Convert page frame number to an OBMEM page frame number
107 * (i.e. put in the type bits -- zero for this implementation)
108 */
109 pfn_t
110 impl_obmem_pfnum(pfn_t pf)
111 {
112 return (pf);
113 }
114
115 /*
116 * Use physmax to determine the highest physical page of DRAM memory
117 * It is assumed that any physical addresses above physmax is in IO space.
118 * We don't bother checking the low end because we assume that memory space
119 * begins at physical page frame 0.
120 *
121 * Return 1 if the page frame is onboard DRAM memory, else 0.
122 * Returns 0 for nvram so it won't be cached.
123 */
124 int
125 pf_is_memory(pfn_t pf)
126 {
127 /* We must be IO space */
128 if (pf > physmax)
129 return (0);
130
131 /* We must be memory space */
132 return (1);
133 }
134
135 /*
136 * Handle a pagefault.
137 */
138 faultcode_t
139 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
140 {
141 struct as *as;
142 struct proc *p;
143 faultcode_t res;
144 caddr_t base;
145 size_t len;
146 int err;
147
148 if (INVALID_VADDR(addr))
149 return (FC_NOMAP);
150
151 if (iskernel) {
152 as = &kas;
153 } else {
154 p = curproc;
155 as = p->p_as;
156 #if defined(SF_ERRATA_57)
157 /*
158 * Prevent infinite loops due to a segment driver
159 * setting the execute permissions and the sfmmu hat
160 * silently ignoring them.
161 */
162 if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
163 addr < errata57_limit) {
164 res = FC_NOMAP;
165 goto out;
166 }
167 #endif
168 }
169
170 /*
171 * Dispatch pagefault.
172 */
173 res = as_fault(as->a_hat, as, addr, 1, type, rw);
174
175 /*
176 * If this isn't a potential unmapped hole in the user's
177 * UNIX data or stack segments, just return status info.
178 */
179 if (!(res == FC_NOMAP && iskernel == 0))
180 goto out;
181
182 /*
183 * Check to see if we happened to faulted on a currently unmapped
184 * part of the UNIX data or stack segments. If so, create a zfod
185 * mapping there and then try calling the fault routine again.
186 */
187 base = p->p_brkbase;
188 len = p->p_brksize;
189
190 if (addr < base || addr >= base + len) { /* data seg? */
191 base = (caddr_t)(p->p_usrstack - p->p_stksize);
192 len = p->p_stksize;
193 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
194 /* not in either UNIX data or stack segments */
195 res = FC_NOMAP;
196 goto out;
197 }
198 }
199
200 /* the rest of this function implements a 3.X 4.X 5.X compatibility */
201 /* This code is probably not needed anymore */
202
203 /* expand the gap to the page boundaries on each side */
204 len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
205 ((uintptr_t)base & PAGEMASK);
206 base = (caddr_t)((uintptr_t)base & PAGEMASK);
207
208 as_rangelock(as);
209 as_purge(as);
210 if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
211 err = as_map(as, base, len, segvn_create, zfod_argsp);
212 as_rangeunlock(as);
213 if (err) {
214 res = FC_MAKE_ERR(err);
215 goto out;
216 }
217 } else {
218 /*
219 * This page is already mapped by another thread after we
220 * returned from as_fault() above. We just fallthrough
221 * as_fault() below.
222 */
223 as_rangeunlock(as);
224 }
225
226 res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
227
228 out:
229
230 return (res);
231 }
232
233 /*
234 * This is the routine which defines the address limit implied
235 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest
236 * mappable address in a 32-bit process on this platform (though
237 * perhaps we should make it be UINT32_MAX here?)
238 */
239 void
240 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
241 {
242 struct proc *p = curproc;
243 caddr_t userlimit = flags & _MAP_LOW32 ?
244 (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
245 map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
246 }
247
/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 * valid_va_range_aligned() trims candidate ranges so they avoid the span
 * between hole_start and hole_end, and valid_usr_range() rejects any range
 * that overlaps it.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t kpm_size;
uchar_t kpm_size_shift;

/*
 * Counts how many times valid_va_range_aligned() was handed a range whose
 * end wrapped past the top of the address space.
 */
int valid_va_range_aligned_wraparound;
261 /*
262 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
263 * addresses at least "minlen" long, where the base of the range is at "off"
264 * phase from an "align" boundary and there is space for a "redzone"-sized
265 * redzone on either side of the range. On success, 1 is returned and *basep
266 * and *lenp are adjusted to describe the acceptable range (including
267 * the redzone). On failure, 0 is returned.
268 */
269 int
270 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
271 size_t align, size_t redzone, size_t off)
272 {
273 caddr_t hi, lo;
274 size_t tot_len;
275
276 ASSERT(align == 0 ? off == 0 : off < align);
277 ASSERT(ISP2(align));
278 ASSERT(align == 0 || align >= PAGESIZE);
279
280 lo = *basep;
281 hi = lo + *lenp;
282 tot_len = minlen + 2 * redzone; /* need at least this much space */
283
284 /* If hi rolled over the top try cutting back. */
285 if (hi < lo) {
286 *lenp = 0UL - (uintptr_t)lo - 1UL;
287 /* Trying to see if this really happens, and then if so, why */
288 valid_va_range_aligned_wraparound++;
289 hi = lo + *lenp;
290 }
291 if (*lenp < tot_len) {
292 return (0);
293 }
294
295 /*
296 * Deal with a possible hole in the address range between
297 * hole_start and hole_end that should never be mapped by the MMU.
298 */
299
300 if (lo < hole_start) {
301 if (hi > hole_start)
302 if (hi < hole_end)
303 hi = hole_start;
304 else
305 /* lo < hole_start && hi >= hole_end */
306 if (dir == AH_LO) {
307 /*
308 * prefer lowest range
309 */
310 if (hole_start - lo >= tot_len)
311 hi = hole_start;
312 else if (hi - hole_end >= tot_len)
313 lo = hole_end;
314 else
315 return (0);
316 } else {
317 /*
318 * prefer highest range
319 */
320 if (hi - hole_end >= tot_len)
321 lo = hole_end;
322 else if (hole_start - lo >= tot_len)
323 hi = hole_start;
324 else
325 return (0);
326 }
327 } else {
328 /* lo >= hole_start */
329 if (hi < hole_end)
330 return (0);
331 if (lo < hole_end)
332 lo = hole_end;
333 }
334
335 /* Check if remaining length is too small */
336 if (hi - lo < tot_len) {
337 return (0);
338 }
339 if (align > 1) {
340 caddr_t tlo = lo + redzone;
341 caddr_t thi = hi - redzone;
342 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
343 if (tlo < lo + redzone) {
344 return (0);
345 }
346 if (thi < tlo || thi - tlo < minlen) {
347 return (0);
348 }
349 }
350 *basep = lo;
351 *lenp = hi - lo;
352 return (1);
353 }
354
355 /*
356 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
357 * addresses at least "minlen" long. On success, 1 is returned and *basep
358 * and *lenp are adjusted to describe the acceptable range. On failure, 0
359 * is returned.
360 */
361 int
362 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
363 {
364 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
365 }
366
367 /*
368 * Default to forbidding the first 64k of address space. This protects most
369 * reasonably sized structures from dereferences through NULL:
370 * ((foo_t *)0)->bar
371 */
372 uintptr_t forbidden_null_mapping_sz = 0x10000;
373
374 /*
375 * Determine whether [addr, addr+len] with protections `prot' are valid
376 * for a user address space.
377 */
378 /*ARGSUSED*/
379 int
380 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
381 caddr_t userlimit)
382 {
383 caddr_t eaddr = addr + len;
384
385 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
386 return (RANGE_BADADDR);
387
388 if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
389 secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
390 return (RANGE_BADADDR);
391
392 /*
393 * Determine if the address range falls within an illegal
394 * range of the MMU.
395 */
396 if (eaddr > hole_start && addr < hole_end)
397 return (RANGE_BADADDR);
398
399 #if defined(SF_ERRATA_57)
400 /*
401 * Make sure USERLIMIT isn't raised too high
402 */
403 ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
404 errata57_limit == 0);
405
406 if (AS_TYPE_64BIT(as) &&
407 (addr < errata57_limit) &&
408 (prot & PROT_EXEC))
409 return (RANGE_BADPROT);
410 #endif /* SF_ERRATA57 */
411 return (RANGE_OKAY);
412 }
413
414 /*
415 * Routine used to check to see if an a.out can be executed
416 * by the current machine/architecture.
417 */
418 int
419 chkaout(struct exdata *exp)
420 {
421 if (exp->ux_mach == M_SPARC)
422 return (0);
423 else
424 return (ENOEXEC);
425 }
426
427 /*
428 * The following functions return information about an a.out
429 * which is used when a program is executed.
430 */
431
432 /*
433 * Return the load memory address for the data segment.
434 */
435 caddr_t
436 getdmem(struct exec *exp)
437 {
438 /*
439 * XXX - Sparc Reference Hack approaching
440 * Remember that we are loading
441 * 8k executables into a 4k machine
442 * DATA_ALIGN == 2 * PAGESIZE
443 */
444 if (exp->a_text)
445 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
446 else
447 return ((caddr_t)USRTEXT);
448 }
449
450 /*
451 * Return the starting disk address for the data segment.
452 */
453 ulong_t
454 getdfile(struct exec *exp)
455 {
456 if (exp->a_magic == ZMAGIC)
457 return (exp->a_text);
458 else
459 return (sizeof (struct exec) + exp->a_text);
460 }
461
462 /*
463 * Return the load memory address for the text segment.
464 */
465
466 /*ARGSUSED*/
467 caddr_t
468 gettmem(struct exec *exp)
469 {
470 return ((caddr_t)USRTEXT);
471 }
472
473 /*
474 * Return the file byte offset for the text segment.
475 */
476 uint_t
477 gettfile(struct exec *exp)
478 {
479 if (exp->a_magic == ZMAGIC)
480 return (0);
481 else
482 return (sizeof (struct exec));
483 }
484
485 void
486 getexinfo(
487 struct exdata *edp_in,
488 struct exdata *edp_out,
489 int *pagetext,
490 int *pagedata)
491 {
492 *edp_out = *edp_in; /* structure copy */
493
494 if ((edp_in->ux_mag == ZMAGIC) &&
495 ((edp_in->vp->v_flag & VNOMAP) == 0)) {
496 *pagetext = 1;
497 *pagedata = 1;
498 } else {
499 *pagetext = 0;
500 *pagedata = 0;
501 }
502 }
503
504 /*
505 * Return non 0 value if the address may cause a VAC alias with KPM mappings.
506 * KPM selects an address such that it's equal offset modulo shm_alignment and
507 * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
508 */
509 int
510 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
511 {
512 if (vac) {
513 return (((uintptr_t)addr ^ off) & shm_alignment - 1);
514 } else {
515 return (0);
516 }
517 }
518
/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than privm_lpg_min_physmem or
 * shm_lpg_min_physmem memory installed.
 * The units for these variables are 8K pages.
 *
 * In map_pgszcvec(), shm_lpg_min_physmem gates MAP_TEXT and MAPPGSZC_SHM
 * requests and privm_lpg_min_physmem gates all the others.  map_pgsz()
 * applies privm_lpg_min_physmem to every map type except MAPPGSZ_ISM.
 */
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */
526
527 static size_t
528 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
529 {
530 size_t pgsz = MMU_PAGESIZE;
531 int szc;
532
533 /*
534 * If len is zero, retrieve from proc and don't demote the page size.
535 * Use atleast the default pagesize.
536 */
537 if (len == 0) {
538 len = p->p_brkbase + p->p_brksize - p->p_bssbase;
539 }
540 len = MAX(len, default_uheap_lpsize);
541
542 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
543 pgsz = hw_page_array[szc].hp_size;
544 if ((disable_auto_data_large_pages & (1 << szc)) ||
545 pgsz > max_uheap_lpsize)
546 continue;
547 if (len >= pgsz) {
548 break;
549 }
550 }
551
552 /*
553 * If addr == 0 we were called by memcntl() when the
554 * size code is 0. Don't set pgsz less than current size.
555 */
556 if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
557 pgsz = hw_page_array[p->p_brkpageszc].hp_size;
558 }
559
560 return (pgsz);
561 }
562
563 static size_t
564 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
565 {
566 size_t pgsz = MMU_PAGESIZE;
567 int szc;
568
569 /*
570 * If len is zero, retrieve from proc and don't demote the page size.
571 * Use atleast the default pagesize.
572 */
573 if (len == 0) {
574 len = p->p_stksize;
575 }
576 len = MAX(len, default_ustack_lpsize);
577
578 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
579 pgsz = hw_page_array[szc].hp_size;
580 if ((disable_auto_data_large_pages & (1 << szc)) ||
581 pgsz > max_ustack_lpsize)
582 continue;
583 if (len >= pgsz) {
584 break;
585 }
586 }
587
588 /*
589 * If addr == 0 we were called by memcntl() or exec_args() when the
590 * size code is 0. Don't set pgsz less than current size.
591 */
592 if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
593 pgsz = hw_page_array[p->p_stkpageszc].hp_size;
594 }
595
596 return (pgsz);
597 }
598
599 static size_t
600 map_pgszism(caddr_t addr, size_t len)
601 {
602 uint_t szc;
603 size_t pgsz;
604
605 for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
606 if (disable_ism_large_pages & (1 << szc))
607 continue;
608
609 pgsz = hw_page_array[szc].hp_size;
610 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
611 return (pgsz);
612 }
613
614 return (DEFAULT_ISM_PAGESIZE);
615 }
616
617 /*
618 * Suggest a page size to be used to map a segment of type maptype and length
619 * len. Returns a page size (not a size code).
620 */
621 /* ARGSUSED */
622 size_t
623 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
624 {
625 size_t pgsz = MMU_PAGESIZE;
626
627 ASSERT(maptype != MAPPGSZ_VA);
628
629 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
630 return (MMU_PAGESIZE);
631 }
632
633 switch (maptype) {
634 case MAPPGSZ_ISM:
635 pgsz = map_pgszism(addr, len);
636 break;
637
638 case MAPPGSZ_STK:
639 if (max_ustack_lpsize > MMU_PAGESIZE) {
640 pgsz = map_pgszstk(p, addr, len);
641 }
642 break;
643
644 case MAPPGSZ_HEAP:
645 if (max_uheap_lpsize > MMU_PAGESIZE) {
646 pgsz = map_pgszheap(p, addr, len);
647 }
648 break;
649 }
650 return (pgsz);
651 }
652
653
654 /* assumes TTE8K...TTE4M == szc */
655
656 static uint_t
657 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
658 size_t max_lpsize, size_t min_physmem)
659 {
660 caddr_t eaddr = addr + size;
661 uint_t szcvec = 0;
662 caddr_t raddr;
663 caddr_t readdr;
664 size_t pgsz;
665 int i;
666
667 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
668 return (0);
669 }
670 for (i = mmu_page_sizes - 1; i > 0; i--) {
671 if (disable_lpgs & (1 << i)) {
672 continue;
673 }
674 pgsz = page_get_pagesize(i);
675 if (pgsz > max_lpsize) {
676 continue;
677 }
678 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
679 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
680 if (raddr < addr || raddr >= readdr) {
681 continue;
682 }
683 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
684 continue;
685 }
686 szcvec |= (1 << i);
687 /*
688 * And or in the remaining enabled page sizes.
689 */
690 szcvec |= P2PHASE(~disable_lpgs, (1 << i));
691 szcvec &= ~1; /* no need to return 8K pagesize */
692 break;
693 }
694 return (szcvec);
695 }
696
697 /*
698 * Return a bit vector of large page size codes that
699 * can be used to map [addr, addr + len) region.
700 */
701 /* ARGSUSED */
702 uint_t
703 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
704 int memcntl)
705 {
706 if (flags & MAP_TEXT) {
707 return (map_szcvec(addr, size, off,
708 disable_auto_text_large_pages,
709 max_utext_lpsize, shm_lpg_min_physmem));
710
711 } else if (flags & MAP_INITDATA) {
712 return (map_szcvec(addr, size, off,
713 disable_auto_data_large_pages,
714 max_uidata_lpsize, privm_lpg_min_physmem));
715
716 } else if (type == MAPPGSZC_SHM) {
717 return (map_szcvec(addr, size, off,
718 disable_auto_data_large_pages,
719 max_shm_lpsize, shm_lpg_min_physmem));
720
721 } else if (type == MAPPGSZC_HEAP) {
722 return (map_szcvec(addr, size, off,
723 disable_auto_data_large_pages,
724 max_uheap_lpsize, privm_lpg_min_physmem));
725
726 } else if (type == MAPPGSZC_STACK) {
727 return (map_szcvec(addr, size, off,
728 disable_auto_data_large_pages,
729 max_ustack_lpsize, privm_lpg_min_physmem));
730
731 } else {
732 return (map_szcvec(addr, size, off,
733 disable_auto_data_large_pages,
734 max_privmap_lpsize, privm_lpg_min_physmem));
735 }
736 }
737
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size, each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */
766
/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 * alloc_page_freelists() points every entry at its storage, so a bin is
 * reached as page_freelists[szc][mtype][mnode][color].
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 * alloc_page_freelists() sizes its bins with the color count of size
 * code 0, indexed as page_cachelists[mtype][mnode][color].
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

/*
 * ndata_alloc_page_mutexs() points each of the NPC_MUTEX entries at an
 * array of max_mem_nodes mutexes.
 * NOTE(review): presumably fpc_ guards the freelists and cpc_ the
 * cachelists -- confirm against the users of these locks.
 */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
781
782 /*
783 * Calculate space needed for page freelists and counters
784 */
785 size_t
786 calc_free_pagelist_sz(void)
787 {
788 int szc;
789 size_t alloc_sz, cache_sz, free_sz;
790
791 /*
792 * one cachelist per color, node, and type
793 */
794 cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
795 sizeof (page_t **);
796 cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
797
798 /*
799 * one freelist per size, color, node, and type
800 */
801 free_sz = sizeof (page_t **);
802 for (szc = 0; szc < mmu_page_sizes; szc++)
803 free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
804 free_sz *= max_mem_nodes * MAX_MEM_TYPES;
805
806 alloc_sz = cache_sz + free_sz + page_ctrs_sz();
807 return (alloc_sz);
808 }
809
810 caddr_t
811 alloc_page_freelists(caddr_t alloc_base)
812 {
813 int mnode, mtype;
814 int szc, clrs;
815
816 /*
817 * We only support small pages in the cachelist.
818 */
819 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
820 page_cachelists[mtype] = (page_t ***)alloc_base;
821 alloc_base += (max_mem_nodes * sizeof (page_t **));
822 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
823 page_cachelists[mtype][mnode] = (page_t **)alloc_base;
824 alloc_base +=
825 (page_get_pagecolors(0) * sizeof (page_t *));
826 }
827 }
828
829 /*
830 * Allocate freelists bins for all
831 * supported page sizes.
832 */
833 for (szc = 0; szc < mmu_page_sizes; szc++) {
834 clrs = page_get_pagecolors(szc);
835 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
836 page_freelists[szc][mtype] = (page_t ***)alloc_base;
837 alloc_base += (max_mem_nodes * sizeof (page_t **));
838 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
839 page_freelists[szc][mtype][mnode] =
840 (page_t **)alloc_base;
841 alloc_base += (clrs * (sizeof (page_t *)));
842 }
843 }
844 }
845
846 alloc_base = page_ctrs_alloc(alloc_base);
847 return (alloc_base);
848 }
849
850 /*
851 * Allocate page_freelists locks for a memnode from the nucleus data
852 * area. This is the first time that mmu_page_sizes is used during
853 * bootup, so check mmu_page_sizes initialization.
854 */
855 int
856 ndata_alloc_page_mutexs(struct memlist *ndata)
857 {
858 size_t alloc_sz;
859 caddr_t alloc_base;
860 int i;
861 void page_coloring_init();
862
863 page_coloring_init();
864 if (&mmu_init_mmu_page_sizes) {
865 if (!mmu_init_mmu_page_sizes(0)) {
866 cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
867 mmu_page_sizes);
868 }
869 }
870 ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
871
872 /* fpc_mutex and cpc_mutex */
873 alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
874
875 alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
876 if (alloc_base == NULL)
877 return (-1);
878
879 ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
880
881 for (i = 0; i < NPC_MUTEX; i++) {
882 fpc_mutex[i] = (kmutex_t *)alloc_base;
883 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
884 cpc_mutex[i] = (kmutex_t *)alloc_base;
885 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
886 }
887 return (0);
888 }
889
890 /*
891 * To select our starting bin, we stride through the bins with a stride
892 * of 337. Why 337? It's prime, it's largeish, and it performs well both
893 * in simulation and practice for different workloads on varying cache sizes.
894 */
895 uint32_t color_start_current = 0;
896 uint32_t color_start_stride = 337;
897 int color_start_random = 0;
898
899 /* ARGSUSED */
900 uint_t
901 get_color_start(struct as *as)
902 {
903 uint32_t old, new;
904
905 if (consistent_coloring == 2 || color_start_random) {
906 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
907 (hw_page_array[0].hp_colors - 1)));
908 }
909
910 do {
911 old = color_start_current;
912 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
913 } while (atomic_cas_32(&color_start_current, old, new) != old);
914
915 return ((uint_t)(new));
916 }
917
918 /*
919 * Called once at startup from kphysm_init() -- before memialloc()
920 * is invoked to do the 1st page_free()/page_freelist_add().
921 *
922 * initializes page_colors and page_colors_mask based on ecache_setsize.
923 *
924 * Also initializes the counter locks.
925 */
926 void
927 page_coloring_init()
928 {
929 int a, i;
930 uint_t colors;
931
932 if (do_pg_coloring == 0) {
933 page_colors = 1;
934 for (i = 0; i < mmu_page_sizes; i++) {
935 colorequivszc[i] = 0;
936 hw_page_array[i].hp_colors = 1;
937 }
938 return;
939 }
940
941 /*
942 * Calculate page_colors from ecache_setsize. ecache_setsize contains
943 * the max ecache setsize of all cpus configured in the system or, for
944 * cheetah+ systems, the max possible ecache setsize for all possible
945 * cheetah+ cpus.
946 */
947 page_colors = ecache_setsize / MMU_PAGESIZE;
948 page_colors_mask = page_colors - 1;
949
950 vac_colors = vac_size / MMU_PAGESIZE;
951 vac_colors_mask = vac_colors -1;
952
953 page_coloring_shift = 0;
954 a = ecache_setsize;
955 while (a >>= 1) {
956 page_coloring_shift++;
957 }
958
959 /* initialize number of colors per page size */
960 for (i = 0; i < mmu_page_sizes; i++) {
961 hw_page_array[i].hp_colors = (page_colors_mask >>
962 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
963 + 1;
964 colorequivszc[i] = 0;
965 }
966
967 /*
968 * initialize cpu_page_colors if ecache setsizes are homogenous.
969 * cpu_page_colors set to -1 during DR operation or during startup
970 * if setsizes are heterogenous.
971 *
972 * The value of cpu_page_colors determines if additional color bins
973 * need to be checked for a particular color in the page_get routines.
974 */
975 if (cpu_setsize > 0 && cpu_page_colors == 0 &&
976 cpu_setsize < ecache_setsize) {
977 cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
978 a = lowbit(page_colors) - lowbit(cpu_page_colors);
979 ASSERT(a > 0);
980 ASSERT(a < 16);
981
982 for (i = 0; i < mmu_page_sizes; i++) {
983 if ((colors = hw_page_array[i].hp_colors) <= 1) {
984 continue;
985 }
986 while ((colors >> a) == 0)
987 a--;
988 ASSERT(a >= 0);
989
990 /* higher 4 bits encodes color equiv mask */
991 colorequivszc[i] = (a << 4);
992 }
993 }
994
995 /* do cpu specific color initialization */
996 if (&page_coloring_init_cpu) {
997 page_coloring_init_cpu();
998 }
999 }
1000
1001 int
1002 bp_color(struct buf *bp)
1003 {
1004 int color = -1;
1005
1006 if (vac) {
1007 if ((bp->b_flags & B_PAGEIO) != 0) {
1008 color = sfmmu_get_ppvcolor(bp->b_pages);
1009 } else if (bp->b_un.b_addr != NULL) {
1010 color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1011 }
1012 }
1013 return (color < 0 ? 0 : ptob(color));
1014 }
1015
/*
 * Flush the D-cache when performing module relocations to an alternate
 * mapping.  Here this simply delegates to sfmmu_cache_flushall().
 * (Original note: stubbed out on all platforms except sun4u, at least
 * for now.)
 */
void
dcache_flushall()
{
	/* The sfmmu layer does all of the work. */
	sfmmu_cache_flushall();
}
1026
/*
 * Report whether [va1, va1 + sz1) and [va2, va2 + sz2) overlap.
 *
 * Returns 0 only when one range starts strictly below the other and ends
 * at or before the other's start; returns 1 in every other case, including
 * two ranges with the same start address.  The end addresses are computed
 * without any overflow check.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	int first_is_below = (va1 < va2) && (va1 + sz1 <= va2);
	int second_is_below = (va2 < va1) && (va2 + sz2 <= va1);

	return (!(first_is_below || second_is_below));
}
1038
1039 /*
1040 * Return the number of bytes, relative to the beginning of a given range, that
1041 * are non-toxic (can be read from and written to with relative impunity).
1042 */
1043 size_t
1044 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1045 {
1046 /* OBP reads are harmless, but we don't want people writing there */
1047 if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1048 OFW_START_ADDR + 1))
1049 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1050
1051 if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1052 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1053
1054 return (sz); /* no overlap */
1055 }
1056
1057 /*
1058 * Minimum physmem required for enabling large pages for kernel heap
1059 * Currently we do not enable lp for kmem on systems with less
1060 * than 1GB of memory. This value can be changed via /etc/system
1061 */
1062 size_t segkmem_lpminphysmem = 0x40000000; /* 1GB */
1063
1064 /*
1065 * this function chooses large page size for kernel heap
1066 */
1067 size_t
1068 get_segkmem_lpsize(size_t lpsize)
1069 {
1070 size_t memtotal = physmem * PAGESIZE;
1071 size_t mmusz;
1072 uint_t szc;
1073
1074 if (memtotal < segkmem_lpminphysmem)
1075 return (PAGESIZE);
1076
1077 if (plat_lpkmem_is_supported != NULL &&
1078 plat_lpkmem_is_supported() == 0)
1079 return (PAGESIZE);
1080
1081 mmusz = mmu_get_kernel_lpsize(lpsize);
1082 szc = page_szc(mmusz);
1083
1084 while (szc) {
1085 if (!(disable_large_pages & (1 << szc)))
1086 return (page_get_pagesize(szc));
1087 szc--;
1088 }
1089 return (PAGESIZE);
1090 }