1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * UNIX machine dependent virtual memory support. 28 */ 29 30 #include <sys/vm.h> 31 #include <sys/exec.h> 32 33 #include <sys/exechdr.h> 34 #include <vm/seg_kmem.h> 35 #include <sys/atomic.h> 36 #include <sys/archsystm.h> 37 #include <sys/machsystm.h> 38 #include <sys/kdi.h> 39 #include <sys/cpu_module.h> 40 #include <sys/secflags.h> 41 42 #include <vm/hat_sfmmu.h> 43 44 #include <sys/memnode.h> 45 46 #include <sys/mem_config.h> 47 #include <sys/mem_cage.h> 48 #include <vm/vm_dep.h> 49 #include <vm/page.h> 50 #include <sys/platform_module.h> 51 52 /* 53 * These variables are set by module specific config routines. 54 * They are only set by modules which will use physical cache page coloring. 55 */ 56 int do_pg_coloring = 0; 57 58 /* 59 * These variables can be conveniently patched at kernel load time to 60 * prevent do_pg_coloring from being enabled by 61 * module specific config routines. 
62 */ 63 64 int use_page_coloring = 1; 65 66 /* 67 * initialized by page_coloring_init() 68 */ 69 extern uint_t page_colors; 70 extern uint_t page_colors_mask; 71 extern uint_t page_coloring_shift; 72 int cpu_page_colors; 73 uint_t vac_colors = 0; 74 uint_t vac_colors_mask = 0; 75 76 /* cpu specific coloring initialization */ 77 extern void page_coloring_init_cpu(); 78 #pragma weak page_coloring_init_cpu 79 80 /* 81 * get the ecache setsize for the current cpu. 82 */ 83 #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize) 84 85 plcnt_t plcnt; /* page list count */ 86 87 /* 88 * This variable is set by the cpu module to contain the lowest 89 * address not affected by the SF_ERRATA_57 workaround. It should 90 * remain 0 if the workaround is not needed. 91 */ 92 #if defined(SF_ERRATA_57) 93 caddr_t errata57_limit; 94 #endif 95 96 extern void page_relocate_hash(page_t *, page_t *); 97 98 /* 99 * these must be defined in platform specific areas 100 */ 101 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t, 102 struct proc *, uint_t); 103 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *, 104 caddr_t, size_t, uint_t, struct lgrp *); 105 /* 106 * Convert page frame number to an OBMEM page frame number 107 * (i.e. put in the type bits -- zero for this implementation) 108 */ 109 pfn_t 110 impl_obmem_pfnum(pfn_t pf) 111 { 112 return (pf); 113 } 114 115 /* 116 * Use physmax to determine the highest physical page of DRAM memory 117 * It is assumed that any physical addresses above physmax is in IO space. 118 * We don't bother checking the low end because we assume that memory space 119 * begins at physical page frame 0. 120 * 121 * Return 1 if the page frame is onboard DRAM memory, else 0. 122 * Returns 0 for nvram so it won't be cached. 
123 */ 124 int 125 pf_is_memory(pfn_t pf) 126 { 127 /* We must be IO space */ 128 if (pf > physmax) 129 return (0); 130 131 /* We must be memory space */ 132 return (1); 133 } 134 135 /* 136 * Handle a pagefault. 137 */ 138 faultcode_t 139 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel) 140 { 141 struct as *as; 142 struct proc *p; 143 faultcode_t res; 144 caddr_t base; 145 size_t len; 146 int err; 147 148 if (INVALID_VADDR(addr)) 149 return (FC_NOMAP); 150 151 if (iskernel) { 152 as = &kas; 153 } else { 154 p = curproc; 155 as = p->p_as; 156 #if defined(SF_ERRATA_57) 157 /* 158 * Prevent infinite loops due to a segment driver 159 * setting the execute permissions and the sfmmu hat 160 * silently ignoring them. 161 */ 162 if (rw == S_EXEC && AS_TYPE_64BIT(as) && 163 addr < errata57_limit) { 164 res = FC_NOMAP; 165 goto out; 166 } 167 #endif 168 } 169 170 /* 171 * Dispatch pagefault. 172 */ 173 res = as_fault(as->a_hat, as, addr, 1, type, rw); 174 175 /* 176 * If this isn't a potential unmapped hole in the user's 177 * UNIX data or stack segments, just return status info. 178 */ 179 if (!(res == FC_NOMAP && iskernel == 0)) 180 goto out; 181 182 /* 183 * Check to see if we happened to faulted on a currently unmapped 184 * part of the UNIX data or stack segments. If so, create a zfod 185 * mapping there and then try calling the fault routine again. 186 */ 187 base = p->p_brkbase; 188 len = p->p_brksize; 189 190 if (addr < base || addr >= base + len) { /* data seg? */ 191 base = (caddr_t)(p->p_usrstack - p->p_stksize); 192 len = p->p_stksize; 193 if (addr < base || addr >= p->p_usrstack) { /* stack seg? 
*/ 194 /* not in either UNIX data or stack segments */ 195 res = FC_NOMAP; 196 goto out; 197 } 198 } 199 200 /* the rest of this function implements a 3.X 4.X 5.X compatibility */ 201 /* This code is probably not needed anymore */ 202 203 /* expand the gap to the page boundaries on each side */ 204 len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) - 205 ((uintptr_t)base & PAGEMASK); 206 base = (caddr_t)((uintptr_t)base & PAGEMASK); 207 208 as_rangelock(as); 209 as_purge(as); 210 if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) { 211 err = as_map(as, base, len, segvn_create, zfod_argsp); 212 as_rangeunlock(as); 213 if (err) { 214 res = FC_MAKE_ERR(err); 215 goto out; 216 } 217 } else { 218 /* 219 * This page is already mapped by another thread after we 220 * returned from as_fault() above. We just fallthrough 221 * as_fault() below. 222 */ 223 as_rangeunlock(as); 224 } 225 226 res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw); 227 228 out: 229 230 return (res); 231 } 232 233 /* 234 * This is the routine which defines the address limit implied 235 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest 236 * mappable address in a 32-bit process on this platform (though 237 * perhaps we should make it be UINT32_MAX here?) 238 */ 239 void 240 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 241 { 242 struct proc *p = curproc; 243 caddr_t userlimit = flags & _MAP_LOW32 ? 244 (caddr_t)USERLIMIT32 : p->p_as->a_userlimit; 245 map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags); 246 } 247 248 /* 249 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range. 
250 */ 251 caddr_t hole_start, hole_end; 252 253 /* 254 * kpm mapping window 255 */ 256 caddr_t kpm_vbase; 257 size_t kpm_size; 258 uchar_t kpm_size_shift; 259 260 int valid_va_range_aligned_wraparound; 261 /* 262 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 263 * addresses at least "minlen" long, where the base of the range is at "off" 264 * phase from an "align" boundary and there is space for a "redzone"-sized 265 * redzone on either side of the range. On success, 1 is returned and *basep 266 * and *lenp are adjusted to describe the acceptable range (including 267 * the redzone). On failure, 0 is returned. 268 */ 269 int 270 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir, 271 size_t align, size_t redzone, size_t off) 272 { 273 caddr_t hi, lo; 274 size_t tot_len; 275 276 ASSERT(align == 0 ? off == 0 : off < align); 277 ASSERT(ISP2(align)); 278 ASSERT(align == 0 || align >= PAGESIZE); 279 280 lo = *basep; 281 hi = lo + *lenp; 282 tot_len = minlen + 2 * redzone; /* need at least this much space */ 283 284 /* If hi rolled over the top try cutting back. */ 285 if (hi < lo) { 286 *lenp = 0UL - (uintptr_t)lo - 1UL; 287 /* Trying to see if this really happens, and then if so, why */ 288 valid_va_range_aligned_wraparound++; 289 hi = lo + *lenp; 290 } 291 if (*lenp < tot_len) { 292 return (0); 293 } 294 295 /* 296 * Deal with a possible hole in the address range between 297 * hole_start and hole_end that should never be mapped by the MMU. 
298 */ 299 300 if (lo < hole_start) { 301 if (hi > hole_start) 302 if (hi < hole_end) 303 hi = hole_start; 304 else 305 /* lo < hole_start && hi >= hole_end */ 306 if (dir == AH_LO) { 307 /* 308 * prefer lowest range 309 */ 310 if (hole_start - lo >= tot_len) 311 hi = hole_start; 312 else if (hi - hole_end >= tot_len) 313 lo = hole_end; 314 else 315 return (0); 316 } else { 317 /* 318 * prefer highest range 319 */ 320 if (hi - hole_end >= tot_len) 321 lo = hole_end; 322 else if (hole_start - lo >= tot_len) 323 hi = hole_start; 324 else 325 return (0); 326 } 327 } else { 328 /* lo >= hole_start */ 329 if (hi < hole_end) 330 return (0); 331 if (lo < hole_end) 332 lo = hole_end; 333 } 334 335 /* Check if remaining length is too small */ 336 if (hi - lo < tot_len) { 337 return (0); 338 } 339 if (align > 1) { 340 caddr_t tlo = lo + redzone; 341 caddr_t thi = hi - redzone; 342 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off); 343 if (tlo < lo + redzone) { 344 return (0); 345 } 346 if (thi < tlo || thi - tlo < minlen) { 347 return (0); 348 } 349 } 350 *basep = lo; 351 *lenp = hi - lo; 352 return (1); 353 } 354 355 /* 356 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 357 * addresses at least "minlen" long. On success, 1 is returned and *basep 358 * and *lenp are adjusted to describe the acceptable range. On failure, 0 359 * is returned. 360 */ 361 int 362 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 363 { 364 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 365 } 366 367 /* 368 * Default to forbidding the first 64k of address space. This protects most 369 * reasonably sized structures from dereferences through NULL: 370 * ((foo_t *)0)->bar 371 */ 372 uintptr_t forbidden_null_mapping_sz = 0x10000; 373 374 /* 375 * Determine whether [addr, addr+len] with protections `prot' are valid 376 * for a user address space. 
377 */ 378 /*ARGSUSED*/ 379 int 380 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 381 caddr_t userlimit) 382 { 383 caddr_t eaddr = addr + len; 384 385 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 386 return (RANGE_BADADDR); 387 388 if ((addr <= (caddr_t)forbidden_null_mapping_sz) && 389 secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP)) 390 return (RANGE_BADADDR); 391 392 /* 393 * Determine if the address range falls within an illegal 394 * range of the MMU. 395 */ 396 if (eaddr > hole_start && addr < hole_end) 397 return (RANGE_BADADDR); 398 399 #if defined(SF_ERRATA_57) 400 /* 401 * Make sure USERLIMIT isn't raised too high 402 */ 403 ASSERT64(addr <= (caddr_t)0xffffffff80000000ul || 404 errata57_limit == 0); 405 406 if (AS_TYPE_64BIT(as) && 407 (addr < errata57_limit) && 408 (prot & PROT_EXEC)) 409 return (RANGE_BADPROT); 410 #endif /* SF_ERRATA57 */ 411 return (RANGE_OKAY); 412 } 413 414 /* 415 * Routine used to check to see if an a.out can be executed 416 * by the current machine/architecture. 417 */ 418 int 419 chkaout(struct exdata *exp) 420 { 421 if (exp->ux_mach == M_SPARC) 422 return (0); 423 else 424 return (ENOEXEC); 425 } 426 427 /* 428 * The following functions return information about an a.out 429 * which is used when a program is executed. 430 */ 431 432 /* 433 * Return the load memory address for the data segment. 434 */ 435 caddr_t 436 getdmem(struct exec *exp) 437 { 438 /* 439 * XXX - Sparc Reference Hack approaching 440 * Remember that we are loading 441 * 8k executables into a 4k machine 442 * DATA_ALIGN == 2 * PAGESIZE 443 */ 444 if (exp->a_text) 445 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN))); 446 else 447 return ((caddr_t)USRTEXT); 448 } 449 450 /* 451 * Return the starting disk address for the data segment. 
452 */ 453 ulong_t 454 getdfile(struct exec *exp) 455 { 456 if (exp->a_magic == ZMAGIC) 457 return (exp->a_text); 458 else 459 return (sizeof (struct exec) + exp->a_text); 460 } 461 462 /* 463 * Return the load memory address for the text segment. 464 */ 465 466 /*ARGSUSED*/ 467 caddr_t 468 gettmem(struct exec *exp) 469 { 470 return ((caddr_t)USRTEXT); 471 } 472 473 /* 474 * Return the file byte offset for the text segment. 475 */ 476 uint_t 477 gettfile(struct exec *exp) 478 { 479 if (exp->a_magic == ZMAGIC) 480 return (0); 481 else 482 return (sizeof (struct exec)); 483 } 484 485 void 486 getexinfo( 487 struct exdata *edp_in, 488 struct exdata *edp_out, 489 int *pagetext, 490 int *pagedata) 491 { 492 *edp_out = *edp_in; /* structure copy */ 493 494 if ((edp_in->ux_mag == ZMAGIC) && 495 ((edp_in->vp->v_flag & VNOMAP) == 0)) { 496 *pagetext = 1; 497 *pagedata = 1; 498 } else { 499 *pagetext = 0; 500 *pagedata = 0; 501 } 502 } 503 504 /* 505 * Return non 0 value if the address may cause a VAC alias with KPM mappings. 506 * KPM selects an address such that it's equal offset modulo shm_alignment and 507 * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping. 508 */ 509 int 510 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 511 { 512 if (vac) { 513 return (((uintptr_t)addr ^ off) & shm_alignment - 1); 514 } else { 515 return (0); 516 } 517 } 518 519 /* 520 * Sanity control. Don't use large pages regardless of user 521 * settings if there's less than priv or shm_lpg_min_physmem memory installed. 522 * The units for this variable is 8K pages. 523 */ 524 pgcnt_t shm_lpg_min_physmem = 131072; /* 1GB */ 525 pgcnt_t privm_lpg_min_physmem = 131072; /* 1GB */ 526 527 static size_t 528 map_pgszheap(struct proc *p, caddr_t addr, size_t len) 529 { 530 size_t pgsz = MMU_PAGESIZE; 531 int szc; 532 533 /* 534 * If len is zero, retrieve from proc and don't demote the page size. 535 * Use atleast the default pagesize. 
536 */ 537 if (len == 0) { 538 len = p->p_brkbase + p->p_brksize - p->p_bssbase; 539 } 540 len = MAX(len, default_uheap_lpsize); 541 542 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) { 543 pgsz = hw_page_array[szc].hp_size; 544 if ((disable_auto_data_large_pages & (1 << szc)) || 545 pgsz > max_uheap_lpsize) 546 continue; 547 if (len >= pgsz) { 548 break; 549 } 550 } 551 552 /* 553 * If addr == 0 we were called by memcntl() when the 554 * size code is 0. Don't set pgsz less than current size. 555 */ 556 if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) { 557 pgsz = hw_page_array[p->p_brkpageszc].hp_size; 558 } 559 560 return (pgsz); 561 } 562 563 static size_t 564 map_pgszstk(struct proc *p, caddr_t addr, size_t len) 565 { 566 size_t pgsz = MMU_PAGESIZE; 567 int szc; 568 569 /* 570 * If len is zero, retrieve from proc and don't demote the page size. 571 * Use atleast the default pagesize. 572 */ 573 if (len == 0) { 574 len = p->p_stksize; 575 } 576 len = MAX(len, default_ustack_lpsize); 577 578 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) { 579 pgsz = hw_page_array[szc].hp_size; 580 if ((disable_auto_data_large_pages & (1 << szc)) || 581 pgsz > max_ustack_lpsize) 582 continue; 583 if (len >= pgsz) { 584 break; 585 } 586 } 587 588 /* 589 * If addr == 0 we were called by memcntl() or exec_args() when the 590 * size code is 0. Don't set pgsz less than current size. 
591 */ 592 if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) { 593 pgsz = hw_page_array[p->p_stkpageszc].hp_size; 594 } 595 596 return (pgsz); 597 } 598 599 static size_t 600 map_pgszism(caddr_t addr, size_t len) 601 { 602 uint_t szc; 603 size_t pgsz; 604 605 for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) { 606 if (disable_ism_large_pages & (1 << szc)) 607 continue; 608 609 pgsz = hw_page_array[szc].hp_size; 610 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz)) 611 return (pgsz); 612 } 613 614 return (DEFAULT_ISM_PAGESIZE); 615 } 616 617 /* 618 * Suggest a page size to be used to map a segment of type maptype and length 619 * len. Returns a page size (not a size code). 620 */ 621 /* ARGSUSED */ 622 size_t 623 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 624 { 625 size_t pgsz = MMU_PAGESIZE; 626 627 ASSERT(maptype != MAPPGSZ_VA); 628 629 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 630 return (MMU_PAGESIZE); 631 } 632 633 switch (maptype) { 634 case MAPPGSZ_ISM: 635 pgsz = map_pgszism(addr, len); 636 break; 637 638 case MAPPGSZ_STK: 639 if (max_ustack_lpsize > MMU_PAGESIZE) { 640 pgsz = map_pgszstk(p, addr, len); 641 } 642 break; 643 644 case MAPPGSZ_HEAP: 645 if (max_uheap_lpsize > MMU_PAGESIZE) { 646 pgsz = map_pgszheap(p, addr, len); 647 } 648 break; 649 } 650 return (pgsz); 651 } 652 653 654 /* assumes TTE8K...TTE4M == szc */ 655 656 static uint_t 657 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs, 658 size_t max_lpsize, size_t min_physmem) 659 { 660 caddr_t eaddr = addr + size; 661 uint_t szcvec = 0; 662 caddr_t raddr; 663 caddr_t readdr; 664 size_t pgsz; 665 int i; 666 667 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 668 return (0); 669 } 670 for (i = mmu_page_sizes - 1; i > 0; i--) { 671 if (disable_lpgs & (1 << i)) { 672 continue; 673 } 674 pgsz = page_get_pagesize(i); 675 if (pgsz > max_lpsize) { 676 continue; 677 } 678 raddr = 
(caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 679 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 680 if (raddr < addr || raddr >= readdr) { 681 continue; 682 } 683 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 684 continue; 685 } 686 szcvec |= (1 << i); 687 /* 688 * And or in the remaining enabled page sizes. 689 */ 690 szcvec |= P2PHASE(~disable_lpgs, (1 << i)); 691 szcvec &= ~1; /* no need to return 8K pagesize */ 692 break; 693 } 694 return (szcvec); 695 } 696 697 /* 698 * Return a bit vector of large page size codes that 699 * can be used to map [addr, addr + len) region. 700 */ 701 /* ARGSUSED */ 702 uint_t 703 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 704 int memcntl) 705 { 706 if (flags & MAP_TEXT) { 707 return (map_szcvec(addr, size, off, 708 disable_auto_text_large_pages, 709 max_utext_lpsize, shm_lpg_min_physmem)); 710 711 } else if (flags & MAP_INITDATA) { 712 return (map_szcvec(addr, size, off, 713 disable_auto_data_large_pages, 714 max_uidata_lpsize, privm_lpg_min_physmem)); 715 716 } else if (type == MAPPGSZC_SHM) { 717 return (map_szcvec(addr, size, off, 718 disable_auto_data_large_pages, 719 max_shm_lpsize, shm_lpg_min_physmem)); 720 721 } else if (type == MAPPGSZC_HEAP) { 722 return (map_szcvec(addr, size, off, 723 disable_auto_data_large_pages, 724 max_uheap_lpsize, privm_lpg_min_physmem)); 725 726 } else if (type == MAPPGSZC_STACK) { 727 return (map_szcvec(addr, size, off, 728 disable_auto_data_large_pages, 729 max_ustack_lpsize, privm_lpg_min_physmem)); 730 731 } else { 732 return (map_szcvec(addr, size, off, 733 disable_auto_data_large_pages, 734 max_privmap_lpsize, privm_lpg_min_physmem)); 735 } 736 } 737 738 /* 739 * Anchored in the table below are counters used to keep track 740 * of free contiguous physical memory. 
Each element of the table contains
 * the array of counters, the size of array which is allocated during
 * startup based on physmax and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages. So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
776 */ 777 page_t ***page_cachelists[MAX_MEM_TYPES]; 778 779 kmutex_t *fpc_mutex[NPC_MUTEX]; 780 kmutex_t *cpc_mutex[NPC_MUTEX]; 781 782 /* 783 * Calculate space needed for page freelists and counters 784 */ 785 size_t 786 calc_free_pagelist_sz(void) 787 { 788 int szc; 789 size_t alloc_sz, cache_sz, free_sz; 790 791 /* 792 * one cachelist per color, node, and type 793 */ 794 cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) + 795 sizeof (page_t **); 796 cache_sz *= max_mem_nodes * MAX_MEM_TYPES; 797 798 /* 799 * one freelist per size, color, node, and type 800 */ 801 free_sz = sizeof (page_t **); 802 for (szc = 0; szc < mmu_page_sizes; szc++) 803 free_sz += sizeof (page_t *) * page_get_pagecolors(szc); 804 free_sz *= max_mem_nodes * MAX_MEM_TYPES; 805 806 alloc_sz = cache_sz + free_sz + page_ctrs_sz(); 807 return (alloc_sz); 808 } 809 810 caddr_t 811 alloc_page_freelists(caddr_t alloc_base) 812 { 813 int mnode, mtype; 814 int szc, clrs; 815 816 /* 817 * We only support small pages in the cachelist. 818 */ 819 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) { 820 page_cachelists[mtype] = (page_t ***)alloc_base; 821 alloc_base += (max_mem_nodes * sizeof (page_t **)); 822 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 823 page_cachelists[mtype][mnode] = (page_t **)alloc_base; 824 alloc_base += 825 (page_get_pagecolors(0) * sizeof (page_t *)); 826 } 827 } 828 829 /* 830 * Allocate freelists bins for all 831 * supported page sizes. 
832 */ 833 for (szc = 0; szc < mmu_page_sizes; szc++) { 834 clrs = page_get_pagecolors(szc); 835 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) { 836 page_freelists[szc][mtype] = (page_t ***)alloc_base; 837 alloc_base += (max_mem_nodes * sizeof (page_t **)); 838 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 839 page_freelists[szc][mtype][mnode] = 840 (page_t **)alloc_base; 841 alloc_base += (clrs * (sizeof (page_t *))); 842 } 843 } 844 } 845 846 alloc_base = page_ctrs_alloc(alloc_base); 847 return (alloc_base); 848 } 849 850 /* 851 * Allocate page_freelists locks for a memnode from the nucleus data 852 * area. This is the first time that mmu_page_sizes is used during 853 * bootup, so check mmu_page_sizes initialization. 854 */ 855 int 856 ndata_alloc_page_mutexs(struct memlist *ndata) 857 { 858 size_t alloc_sz; 859 caddr_t alloc_base; 860 int i; 861 void page_coloring_init(); 862 863 page_coloring_init(); 864 if (&mmu_init_mmu_page_sizes) { 865 if (!mmu_init_mmu_page_sizes(0)) { 866 cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized", 867 mmu_page_sizes); 868 } 869 } 870 ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES); 871 872 /* fpc_mutex and cpc_mutex */ 873 alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t); 874 875 alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize); 876 if (alloc_base == NULL) 877 return (-1); 878 879 ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0); 880 881 for (i = 0; i < NPC_MUTEX; i++) { 882 fpc_mutex[i] = (kmutex_t *)alloc_base; 883 alloc_base += (sizeof (kmutex_t) * max_mem_nodes); 884 cpc_mutex[i] = (kmutex_t *)alloc_base; 885 alloc_base += (sizeof (kmutex_t) * max_mem_nodes); 886 } 887 return (0); 888 } 889 890 /* 891 * To select our starting bin, we stride through the bins with a stride 892 * of 337. Why 337? It's prime, it's largeish, and it performs well both 893 * in simulation and practice for different workloads on varying cache sizes. 
894 */ 895 uint32_t color_start_current = 0; 896 uint32_t color_start_stride = 337; 897 int color_start_random = 0; 898 899 /* ARGSUSED */ 900 uint_t 901 get_color_start(struct as *as) 902 { 903 uint32_t old, new; 904 905 if (consistent_coloring == 2 || color_start_random) { 906 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) & 907 (hw_page_array[0].hp_colors - 1))); 908 } 909 910 do { 911 old = color_start_current; 912 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT)); 913 } while (atomic_cas_32(&color_start_current, old, new) != old); 914 915 return ((uint_t)(new)); 916 } 917 918 /* 919 * Called once at startup from kphysm_init() -- before memialloc() 920 * is invoked to do the 1st page_free()/page_freelist_add(). 921 * 922 * initializes page_colors and page_colors_mask based on ecache_setsize. 923 * 924 * Also initializes the counter locks. 925 */ 926 void 927 page_coloring_init() 928 { 929 int a, i; 930 uint_t colors; 931 932 if (do_pg_coloring == 0) { 933 page_colors = 1; 934 for (i = 0; i < mmu_page_sizes; i++) { 935 colorequivszc[i] = 0; 936 hw_page_array[i].hp_colors = 1; 937 } 938 return; 939 } 940 941 /* 942 * Calculate page_colors from ecache_setsize. ecache_setsize contains 943 * the max ecache setsize of all cpus configured in the system or, for 944 * cheetah+ systems, the max possible ecache setsize for all possible 945 * cheetah+ cpus. 
946 */ 947 page_colors = ecache_setsize / MMU_PAGESIZE; 948 page_colors_mask = page_colors - 1; 949 950 vac_colors = vac_size / MMU_PAGESIZE; 951 vac_colors_mask = vac_colors -1; 952 953 page_coloring_shift = 0; 954 a = ecache_setsize; 955 while (a >>= 1) { 956 page_coloring_shift++; 957 } 958 959 /* initialize number of colors per page size */ 960 for (i = 0; i < mmu_page_sizes; i++) { 961 hw_page_array[i].hp_colors = (page_colors_mask >> 962 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 963 + 1; 964 colorequivszc[i] = 0; 965 } 966 967 /* 968 * initialize cpu_page_colors if ecache setsizes are homogenous. 969 * cpu_page_colors set to -1 during DR operation or during startup 970 * if setsizes are heterogenous. 971 * 972 * The value of cpu_page_colors determines if additional color bins 973 * need to be checked for a particular color in the page_get routines. 974 */ 975 if (cpu_setsize > 0 && cpu_page_colors == 0 && 976 cpu_setsize < ecache_setsize) { 977 cpu_page_colors = cpu_setsize / MMU_PAGESIZE; 978 a = lowbit(page_colors) - lowbit(cpu_page_colors); 979 ASSERT(a > 0); 980 ASSERT(a < 16); 981 982 for (i = 0; i < mmu_page_sizes; i++) { 983 if ((colors = hw_page_array[i].hp_colors) <= 1) { 984 continue; 985 } 986 while ((colors >> a) == 0) 987 a--; 988 ASSERT(a >= 0); 989 990 /* higher 4 bits encodes color equiv mask */ 991 colorequivszc[i] = (a << 4); 992 } 993 } 994 995 /* do cpu specific color initialization */ 996 if (&page_coloring_init_cpu) { 997 page_coloring_init_cpu(); 998 } 999 } 1000 1001 int 1002 bp_color(struct buf *bp) 1003 { 1004 int color = -1; 1005 1006 if (vac) { 1007 if ((bp->b_flags & B_PAGEIO) != 0) { 1008 color = sfmmu_get_ppvcolor(bp->b_pages); 1009 } else if (bp->b_un.b_addr != NULL) { 1010 color = sfmmu_get_addrvcolor(bp->b_un.b_addr); 1011 } 1012 } 1013 return (color < 0 ? 0 : ptob(color)); 1014 } 1015 1016 /* 1017 * Function for flushing D-cache when performing module relocations 1018 * to an alternate mapping. 
Stubbed out on all platforms except sun4u, 1019 * at least for now. 1020 */ 1021 void 1022 dcache_flushall() 1023 { 1024 sfmmu_cache_flushall(); 1025 } 1026 1027 static int 1028 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2) 1029 { 1030 if (va1 < va2 && va1 + sz1 <= va2) 1031 return (0); 1032 1033 if (va2 < va1 && va2 + sz2 <= va1) 1034 return (0); 1035 1036 return (1); 1037 } 1038 1039 /* 1040 * Return the number of bytes, relative to the beginning of a given range, that 1041 * are non-toxic (can be read from and written to with relative impunity). 1042 */ 1043 size_t 1044 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write) 1045 { 1046 /* OBP reads are harmless, but we don't want people writing there */ 1047 if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR - 1048 OFW_START_ADDR + 1)) 1049 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0); 1050 1051 if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE)) 1052 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0); 1053 1054 return (sz); /* no overlap */ 1055 } 1056 1057 /* 1058 * Minimum physmem required for enabling large pages for kernel heap 1059 * Currently we do not enable lp for kmem on systems with less 1060 * than 1GB of memory. This value can be changed via /etc/system 1061 */ 1062 size_t segkmem_lpminphysmem = 0x40000000; /* 1GB */ 1063 1064 /* 1065 * this function chooses large page size for kernel heap 1066 */ 1067 size_t 1068 get_segkmem_lpsize(size_t lpsize) 1069 { 1070 size_t memtotal = physmem * PAGESIZE; 1071 size_t mmusz; 1072 uint_t szc; 1073 1074 if (memtotal < segkmem_lpminphysmem) 1075 return (PAGESIZE); 1076 1077 if (plat_lpkmem_is_supported != NULL && 1078 plat_lpkmem_is_supported() == 0) 1079 return (PAGESIZE); 1080 1081 mmusz = mmu_get_kernel_lpsize(lpsize); 1082 szc = page_szc(mmusz); 1083 1084 while (szc) { 1085 if (!(disable_large_pages & (1 << szc))) 1086 return (page_get_pagesize(szc)); 1087 szc--; 1088 } 1089 return (PAGESIZE); 1090 }