1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * UNIX machine dependent virtual memory support. 28 */ 29 30 #include <sys/vm.h> 31 #include <sys/exec.h> 32 33 #include <sys/exechdr.h> 34 #include <vm/seg_kmem.h> 35 #include <sys/atomic.h> 36 #include <sys/archsystm.h> 37 #include <sys/machsystm.h> 38 #include <sys/kdi.h> 39 #include <sys/cpu_module.h> 40 41 #include <vm/hat_sfmmu.h> 42 43 #include <sys/memnode.h> 44 45 #include <sys/mem_config.h> 46 #include <sys/mem_cage.h> 47 #include <vm/vm_dep.h> 48 #include <vm/page.h> 49 #include <sys/platform_module.h> 50 51 /* 52 * These variables are set by module specific config routines. 53 * They are only set by modules which will use physical cache page coloring. 54 */ 55 int do_pg_coloring = 0; 56 57 /* 58 * These variables can be conveniently patched at kernel load time to 59 * prevent do_pg_coloring from being enabled by 60 * module specific config routines. 
61 */ 62 63 int use_page_coloring = 1; 64 65 /* 66 * initialized by page_coloring_init() 67 */ 68 extern uint_t page_colors; 69 extern uint_t page_colors_mask; 70 extern uint_t page_coloring_shift; 71 int cpu_page_colors; 72 uint_t vac_colors = 0; 73 uint_t vac_colors_mask = 0; 74 75 /* cpu specific coloring initialization */ 76 extern void page_coloring_init_cpu(); 77 #pragma weak page_coloring_init_cpu 78 79 /* 80 * get the ecache setsize for the current cpu. 81 */ 82 #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize) 83 84 plcnt_t plcnt; /* page list count */ 85 86 /* 87 * This variable is set by the cpu module to contain the lowest 88 * address not affected by the SF_ERRATA_57 workaround. It should 89 * remain 0 if the workaround is not needed. 90 */ 91 #if defined(SF_ERRATA_57) 92 caddr_t errata57_limit; 93 #endif 94 95 extern void page_relocate_hash(page_t *, page_t *); 96 97 /* 98 * these must be defined in platform specific areas 99 */ 100 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t, 101 struct proc *, uint_t); 102 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *, 103 caddr_t, size_t, uint_t, struct lgrp *); 104 /* 105 * Convert page frame number to an OBMEM page frame number 106 * (i.e. put in the type bits -- zero for this implementation) 107 */ 108 pfn_t 109 impl_obmem_pfnum(pfn_t pf) 110 { 111 return (pf); 112 } 113 114 /* 115 * Use physmax to determine the highest physical page of DRAM memory 116 * It is assumed that any physical addresses above physmax is in IO space. 117 * We don't bother checking the low end because we assume that memory space 118 * begins at physical page frame 0. 119 * 120 * Return 1 if the page frame is onboard DRAM memory, else 0. 121 * Returns 0 for nvram so it won't be cached. 
122 */ 123 int 124 pf_is_memory(pfn_t pf) 125 { 126 /* We must be IO space */ 127 if (pf > physmax) 128 return (0); 129 130 /* We must be memory space */ 131 return (1); 132 } 133 134 /* 135 * Handle a pagefault. 136 */ 137 faultcode_t 138 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel) 139 { 140 struct as *as; 141 struct proc *p; 142 faultcode_t res; 143 caddr_t base; 144 size_t len; 145 int err; 146 147 if (INVALID_VADDR(addr)) 148 return (FC_NOMAP); 149 150 if (iskernel) { 151 as = &kas; 152 } else { 153 p = curproc; 154 as = p->p_as; 155 #if defined(SF_ERRATA_57) 156 /* 157 * Prevent infinite loops due to a segment driver 158 * setting the execute permissions and the sfmmu hat 159 * silently ignoring them. 160 */ 161 if (rw == S_EXEC && AS_TYPE_64BIT(as) && 162 addr < errata57_limit) { 163 res = FC_NOMAP; 164 goto out; 165 } 166 #endif 167 } 168 169 /* 170 * Dispatch pagefault. 171 */ 172 res = as_fault(as->a_hat, as, addr, 1, type, rw); 173 174 /* 175 * If this isn't a potential unmapped hole in the user's 176 * UNIX data or stack segments, just return status info. 177 */ 178 if (!(res == FC_NOMAP && iskernel == 0)) 179 goto out; 180 181 /* 182 * Check to see if we happened to faulted on a currently unmapped 183 * part of the UNIX data or stack segments. If so, create a zfod 184 * mapping there and then try calling the fault routine again. 185 */ 186 base = p->p_brkbase; 187 len = p->p_brksize; 188 189 if (addr < base || addr >= base + len) { /* data seg? */ 190 base = (caddr_t)(p->p_usrstack - p->p_stksize); 191 len = p->p_stksize; 192 if (addr < base || addr >= p->p_usrstack) { /* stack seg? 
*/ 193 /* not in either UNIX data or stack segments */ 194 res = FC_NOMAP; 195 goto out; 196 } 197 } 198 199 /* the rest of this function implements a 3.X 4.X 5.X compatibility */ 200 /* This code is probably not needed anymore */ 201 202 /* expand the gap to the page boundaries on each side */ 203 len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) - 204 ((uintptr_t)base & PAGEMASK); 205 base = (caddr_t)((uintptr_t)base & PAGEMASK); 206 207 as_rangelock(as); 208 as_purge(as); 209 if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) { 210 err = as_map(as, base, len, segvn_create, zfod_argsp); 211 as_rangeunlock(as); 212 if (err) { 213 res = FC_MAKE_ERR(err); 214 goto out; 215 } 216 } else { 217 /* 218 * This page is already mapped by another thread after we 219 * returned from as_fault() above. We just fallthrough 220 * as_fault() below. 221 */ 222 as_rangeunlock(as); 223 } 224 225 res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw); 226 227 out: 228 229 return (res); 230 } 231 232 /* 233 * This is the routine which defines the address limit implied 234 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest 235 * mappable address in a 32-bit process on this platform (though 236 * perhaps we should make it be UINT32_MAX here?) 237 */ 238 void 239 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 240 { 241 struct proc *p = curproc; 242 caddr_t userlimit = flags & _MAP_LOW32 ? 243 (caddr_t)USERLIMIT32 : p->p_as->a_userlimit; 244 map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags); 245 } 246 247 /* 248 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range. 
249 */ 250 caddr_t hole_start, hole_end; 251 252 /* 253 * kpm mapping window 254 */ 255 caddr_t kpm_vbase; 256 size_t kpm_size; 257 uchar_t kpm_size_shift; 258 259 int valid_va_range_aligned_wraparound; 260 /* 261 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 262 * addresses at least "minlen" long, where the base of the range is at "off" 263 * phase from an "align" boundary and there is space for a "redzone"-sized 264 * redzone on either side of the range. On success, 1 is returned and *basep 265 * and *lenp are adjusted to describe the acceptable range (including 266 * the redzone). On failure, 0 is returned. 267 */ 268 int 269 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir, 270 size_t align, size_t redzone, size_t off) 271 { 272 caddr_t hi, lo; 273 size_t tot_len; 274 275 ASSERT(align == 0 ? off == 0 : off < align); 276 ASSERT(ISP2(align)); 277 ASSERT(align == 0 || align >= PAGESIZE); 278 279 lo = *basep; 280 hi = lo + *lenp; 281 tot_len = minlen + 2 * redzone; /* need at least this much space */ 282 283 /* If hi rolled over the top try cutting back. */ 284 if (hi < lo) { 285 *lenp = 0UL - (uintptr_t)lo - 1UL; 286 /* Trying to see if this really happens, and then if so, why */ 287 valid_va_range_aligned_wraparound++; 288 hi = lo + *lenp; 289 } 290 if (*lenp < tot_len) { 291 return (0); 292 } 293 294 /* 295 * Deal with a possible hole in the address range between 296 * hole_start and hole_end that should never be mapped by the MMU. 
297 */ 298 299 if (lo < hole_start) { 300 if (hi > hole_start) 301 if (hi < hole_end) 302 hi = hole_start; 303 else 304 /* lo < hole_start && hi >= hole_end */ 305 if (dir == AH_LO) { 306 /* 307 * prefer lowest range 308 */ 309 if (hole_start - lo >= tot_len) 310 hi = hole_start; 311 else if (hi - hole_end >= tot_len) 312 lo = hole_end; 313 else 314 return (0); 315 } else { 316 /* 317 * prefer highest range 318 */ 319 if (hi - hole_end >= tot_len) 320 lo = hole_end; 321 else if (hole_start - lo >= tot_len) 322 hi = hole_start; 323 else 324 return (0); 325 } 326 } else { 327 /* lo >= hole_start */ 328 if (hi < hole_end) 329 return (0); 330 if (lo < hole_end) 331 lo = hole_end; 332 } 333 334 /* Check if remaining length is too small */ 335 if (hi - lo < tot_len) { 336 return (0); 337 } 338 if (align > 1) { 339 caddr_t tlo = lo + redzone; 340 caddr_t thi = hi - redzone; 341 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off); 342 if (tlo < lo + redzone) { 343 return (0); 344 } 345 if (thi < tlo || thi - tlo < minlen) { 346 return (0); 347 } 348 } 349 *basep = lo; 350 *lenp = hi - lo; 351 return (1); 352 } 353 354 /* 355 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 356 * addresses at least "minlen" long. On success, 1 is returned and *basep 357 * and *lenp are adjusted to describe the acceptable range. On failure, 0 358 * is returned. 359 */ 360 int 361 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 362 { 363 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 364 } 365 366 /* 367 * Determine whether [addr, addr+len] with protections `prot' are valid 368 * for a user address space. 
369 */ 370 /*ARGSUSED*/ 371 int 372 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 373 caddr_t userlimit) 374 { 375 caddr_t eaddr = addr + len; 376 377 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 378 return (RANGE_BADADDR); 379 380 /* 381 * Determine if the address range falls within an illegal 382 * range of the MMU. 383 */ 384 if (eaddr > hole_start && addr < hole_end) 385 return (RANGE_BADADDR); 386 387 #if defined(SF_ERRATA_57) 388 /* 389 * Make sure USERLIMIT isn't raised too high 390 */ 391 ASSERT64(addr <= (caddr_t)0xffffffff80000000ul || 392 errata57_limit == 0); 393 394 if (AS_TYPE_64BIT(as) && 395 (addr < errata57_limit) && 396 (prot & PROT_EXEC)) 397 return (RANGE_BADPROT); 398 #endif /* SF_ERRATA57 */ 399 return (RANGE_OKAY); 400 } 401 402 /* 403 * Routine used to check to see if an a.out can be executed 404 * by the current machine/architecture. 405 */ 406 int 407 chkaout(struct exdata *exp) 408 { 409 if (exp->ux_mach == M_SPARC) 410 return (0); 411 else 412 return (ENOEXEC); 413 } 414 415 /* 416 * The following functions return information about an a.out 417 * which is used when a program is executed. 418 */ 419 420 /* 421 * Return the load memory address for the data segment. 422 */ 423 caddr_t 424 getdmem(struct exec *exp) 425 { 426 /* 427 * XXX - Sparc Reference Hack approaching 428 * Remember that we are loading 429 * 8k executables into a 4k machine 430 * DATA_ALIGN == 2 * PAGESIZE 431 */ 432 if (exp->a_text) 433 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN))); 434 else 435 return ((caddr_t)USRTEXT); 436 } 437 438 /* 439 * Return the starting disk address for the data segment. 440 */ 441 ulong_t 442 getdfile(struct exec *exp) 443 { 444 if (exp->a_magic == ZMAGIC) 445 return (exp->a_text); 446 else 447 return (sizeof (struct exec) + exp->a_text); 448 } 449 450 /* 451 * Return the load memory address for the text segment. 
452 */ 453 454 /*ARGSUSED*/ 455 caddr_t 456 gettmem(struct exec *exp) 457 { 458 return ((caddr_t)USRTEXT); 459 } 460 461 /* 462 * Return the file byte offset for the text segment. 463 */ 464 uint_t 465 gettfile(struct exec *exp) 466 { 467 if (exp->a_magic == ZMAGIC) 468 return (0); 469 else 470 return (sizeof (struct exec)); 471 } 472 473 void 474 getexinfo( 475 struct exdata *edp_in, 476 struct exdata *edp_out, 477 int *pagetext, 478 int *pagedata) 479 { 480 *edp_out = *edp_in; /* structure copy */ 481 482 if ((edp_in->ux_mag == ZMAGIC) && 483 ((edp_in->vp->v_flag & VNOMAP) == 0)) { 484 *pagetext = 1; 485 *pagedata = 1; 486 } else { 487 *pagetext = 0; 488 *pagedata = 0; 489 } 490 } 491 492 /* 493 * Return non 0 value if the address may cause a VAC alias with KPM mappings. 494 * KPM selects an address such that it's equal offset modulo shm_alignment and 495 * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping. 496 */ 497 int 498 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 499 { 500 if (vac) { 501 return (((uintptr_t)addr ^ off) & shm_alignment - 1); 502 } else { 503 return (0); 504 } 505 } 506 507 /* 508 * Sanity control. Don't use large pages regardless of user 509 * settings if there's less than priv or shm_lpg_min_physmem memory installed. 510 * The units for this variable is 8K pages. 511 */ 512 pgcnt_t shm_lpg_min_physmem = 131072; /* 1GB */ 513 pgcnt_t privm_lpg_min_physmem = 131072; /* 1GB */ 514 515 static size_t 516 map_pgszheap(struct proc *p, caddr_t addr, size_t len) 517 { 518 size_t pgsz = MMU_PAGESIZE; 519 int szc; 520 521 /* 522 * If len is zero, retrieve from proc and don't demote the page size. 523 * Use atleast the default pagesize. 
524 */ 525 if (len == 0) { 526 len = p->p_brkbase + p->p_brksize - p->p_bssbase; 527 } 528 len = MAX(len, default_uheap_lpsize); 529 530 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) { 531 pgsz = hw_page_array[szc].hp_size; 532 if ((disable_auto_data_large_pages & (1 << szc)) || 533 pgsz > max_uheap_lpsize) 534 continue; 535 if (len >= pgsz) { 536 break; 537 } 538 } 539 540 /* 541 * If addr == 0 we were called by memcntl() when the 542 * size code is 0. Don't set pgsz less than current size. 543 */ 544 if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) { 545 pgsz = hw_page_array[p->p_brkpageszc].hp_size; 546 } 547 548 return (pgsz); 549 } 550 551 static size_t 552 map_pgszstk(struct proc *p, caddr_t addr, size_t len) 553 { 554 size_t pgsz = MMU_PAGESIZE; 555 int szc; 556 557 /* 558 * If len is zero, retrieve from proc and don't demote the page size. 559 * Use atleast the default pagesize. 560 */ 561 if (len == 0) { 562 len = p->p_stksize; 563 } 564 len = MAX(len, default_ustack_lpsize); 565 566 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) { 567 pgsz = hw_page_array[szc].hp_size; 568 if ((disable_auto_data_large_pages & (1 << szc)) || 569 pgsz > max_ustack_lpsize) 570 continue; 571 if (len >= pgsz) { 572 break; 573 } 574 } 575 576 /* 577 * If addr == 0 we were called by memcntl() or exec_args() when the 578 * size code is 0. Don't set pgsz less than current size. 
579 */ 580 if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) { 581 pgsz = hw_page_array[p->p_stkpageszc].hp_size; 582 } 583 584 return (pgsz); 585 } 586 587 static size_t 588 map_pgszism(caddr_t addr, size_t len) 589 { 590 uint_t szc; 591 size_t pgsz; 592 593 for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) { 594 if (disable_ism_large_pages & (1 << szc)) 595 continue; 596 597 pgsz = hw_page_array[szc].hp_size; 598 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz)) 599 return (pgsz); 600 } 601 602 return (DEFAULT_ISM_PAGESIZE); 603 } 604 605 /* 606 * Suggest a page size to be used to map a segment of type maptype and length 607 * len. Returns a page size (not a size code). 608 */ 609 /* ARGSUSED */ 610 size_t 611 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 612 { 613 size_t pgsz = MMU_PAGESIZE; 614 615 ASSERT(maptype != MAPPGSZ_VA); 616 617 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 618 return (MMU_PAGESIZE); 619 } 620 621 switch (maptype) { 622 case MAPPGSZ_ISM: 623 pgsz = map_pgszism(addr, len); 624 break; 625 626 case MAPPGSZ_STK: 627 if (max_ustack_lpsize > MMU_PAGESIZE) { 628 pgsz = map_pgszstk(p, addr, len); 629 } 630 break; 631 632 case MAPPGSZ_HEAP: 633 if (max_uheap_lpsize > MMU_PAGESIZE) { 634 pgsz = map_pgszheap(p, addr, len); 635 } 636 break; 637 } 638 return (pgsz); 639 } 640 641 642 /* assumes TTE8K...TTE4M == szc */ 643 644 static uint_t 645 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs, 646 size_t max_lpsize, size_t min_physmem) 647 { 648 caddr_t eaddr = addr + size; 649 uint_t szcvec = 0; 650 caddr_t raddr; 651 caddr_t readdr; 652 size_t pgsz; 653 int i; 654 655 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 656 return (0); 657 } 658 for (i = mmu_page_sizes - 1; i > 0; i--) { 659 if (disable_lpgs & (1 << i)) { 660 continue; 661 } 662 pgsz = page_get_pagesize(i); 663 if (pgsz > max_lpsize) { 664 continue; 665 } 666 raddr = 
(caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 667 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 668 if (raddr < addr || raddr >= readdr) { 669 continue; 670 } 671 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 672 continue; 673 } 674 szcvec |= (1 << i); 675 /* 676 * And or in the remaining enabled page sizes. 677 */ 678 szcvec |= P2PHASE(~disable_lpgs, (1 << i)); 679 szcvec &= ~1; /* no need to return 8K pagesize */ 680 break; 681 } 682 return (szcvec); 683 } 684 685 /* 686 * Return a bit vector of large page size codes that 687 * can be used to map [addr, addr + len) region. 688 */ 689 /* ARGSUSED */ 690 uint_t 691 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 692 int memcntl) 693 { 694 if (flags & MAP_TEXT) { 695 return (map_szcvec(addr, size, off, 696 disable_auto_text_large_pages, 697 max_utext_lpsize, shm_lpg_min_physmem)); 698 699 } else if (flags & MAP_INITDATA) { 700 return (map_szcvec(addr, size, off, 701 disable_auto_data_large_pages, 702 max_uidata_lpsize, privm_lpg_min_physmem)); 703 704 } else if (type == MAPPGSZC_SHM) { 705 return (map_szcvec(addr, size, off, 706 disable_auto_data_large_pages, 707 max_shm_lpsize, shm_lpg_min_physmem)); 708 709 } else if (type == MAPPGSZC_HEAP) { 710 return (map_szcvec(addr, size, off, 711 disable_auto_data_large_pages, 712 max_uheap_lpsize, privm_lpg_min_physmem)); 713 714 } else if (type == MAPPGSZC_STACK) { 715 return (map_szcvec(addr, size, off, 716 disable_auto_data_large_pages, 717 max_ustack_lpsize, privm_lpg_min_physmem)); 718 719 } else { 720 return (map_szcvec(addr, size, off, 721 disable_auto_data_large_pages, 722 max_privmap_lpsize, privm_lpg_min_physmem)); 723 } 724 } 725 726 /* 727 * Anchored in the table below are counters used to keep track 728 * of free contiguous physical memory. 
Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa. The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page
 *			made up of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M. It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages. So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
764 */ 765 page_t ***page_cachelists[MAX_MEM_TYPES]; 766 767 kmutex_t *fpc_mutex[NPC_MUTEX]; 768 kmutex_t *cpc_mutex[NPC_MUTEX]; 769 770 /* 771 * Calculate space needed for page freelists and counters 772 */ 773 size_t 774 calc_free_pagelist_sz(void) 775 { 776 int szc; 777 size_t alloc_sz, cache_sz, free_sz; 778 779 /* 780 * one cachelist per color, node, and type 781 */ 782 cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) + 783 sizeof (page_t **); 784 cache_sz *= max_mem_nodes * MAX_MEM_TYPES; 785 786 /* 787 * one freelist per size, color, node, and type 788 */ 789 free_sz = sizeof (page_t **); 790 for (szc = 0; szc < mmu_page_sizes; szc++) 791 free_sz += sizeof (page_t *) * page_get_pagecolors(szc); 792 free_sz *= max_mem_nodes * MAX_MEM_TYPES; 793 794 alloc_sz = cache_sz + free_sz + page_ctrs_sz(); 795 return (alloc_sz); 796 } 797 798 caddr_t 799 alloc_page_freelists(caddr_t alloc_base) 800 { 801 int mnode, mtype; 802 int szc, clrs; 803 804 /* 805 * We only support small pages in the cachelist. 806 */ 807 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) { 808 page_cachelists[mtype] = (page_t ***)alloc_base; 809 alloc_base += (max_mem_nodes * sizeof (page_t **)); 810 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 811 page_cachelists[mtype][mnode] = (page_t **)alloc_base; 812 alloc_base += 813 (page_get_pagecolors(0) * sizeof (page_t *)); 814 } 815 } 816 817 /* 818 * Allocate freelists bins for all 819 * supported page sizes. 
820 */ 821 for (szc = 0; szc < mmu_page_sizes; szc++) { 822 clrs = page_get_pagecolors(szc); 823 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) { 824 page_freelists[szc][mtype] = (page_t ***)alloc_base; 825 alloc_base += (max_mem_nodes * sizeof (page_t **)); 826 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 827 page_freelists[szc][mtype][mnode] = 828 (page_t **)alloc_base; 829 alloc_base += (clrs * (sizeof (page_t *))); 830 } 831 } 832 } 833 834 alloc_base = page_ctrs_alloc(alloc_base); 835 return (alloc_base); 836 } 837 838 /* 839 * Allocate page_freelists locks for a memnode from the nucleus data 840 * area. This is the first time that mmu_page_sizes is used during 841 * bootup, so check mmu_page_sizes initialization. 842 */ 843 int 844 ndata_alloc_page_mutexs(struct memlist *ndata) 845 { 846 size_t alloc_sz; 847 caddr_t alloc_base; 848 int i; 849 void page_coloring_init(); 850 851 page_coloring_init(); 852 if (&mmu_init_mmu_page_sizes) { 853 if (!mmu_init_mmu_page_sizes(0)) { 854 cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized", 855 mmu_page_sizes); 856 } 857 } 858 ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES); 859 860 /* fpc_mutex and cpc_mutex */ 861 alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t); 862 863 alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize); 864 if (alloc_base == NULL) 865 return (-1); 866 867 ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0); 868 869 for (i = 0; i < NPC_MUTEX; i++) { 870 fpc_mutex[i] = (kmutex_t *)alloc_base; 871 alloc_base += (sizeof (kmutex_t) * max_mem_nodes); 872 cpc_mutex[i] = (kmutex_t *)alloc_base; 873 alloc_base += (sizeof (kmutex_t) * max_mem_nodes); 874 } 875 return (0); 876 } 877 878 /* 879 * To select our starting bin, we stride through the bins with a stride 880 * of 337. Why 337? It's prime, it's largeish, and it performs well both 881 * in simulation and practice for different workloads on varying cache sizes. 
882 */ 883 uint32_t color_start_current = 0; 884 uint32_t color_start_stride = 337; 885 int color_start_random = 0; 886 887 /* ARGSUSED */ 888 uint_t 889 get_color_start(struct as *as) 890 { 891 uint32_t old, new; 892 893 if (consistent_coloring == 2 || color_start_random) { 894 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) & 895 (hw_page_array[0].hp_colors - 1))); 896 } 897 898 do { 899 old = color_start_current; 900 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT)); 901 } while (atomic_cas_32(&color_start_current, old, new) != old); 902 903 return ((uint_t)(new)); 904 } 905 906 /* 907 * Called once at startup from kphysm_init() -- before memialloc() 908 * is invoked to do the 1st page_free()/page_freelist_add(). 909 * 910 * initializes page_colors and page_colors_mask based on ecache_setsize. 911 * 912 * Also initializes the counter locks. 913 */ 914 void 915 page_coloring_init() 916 { 917 int a, i; 918 uint_t colors; 919 920 if (do_pg_coloring == 0) { 921 page_colors = 1; 922 for (i = 0; i < mmu_page_sizes; i++) { 923 colorequivszc[i] = 0; 924 hw_page_array[i].hp_colors = 1; 925 } 926 return; 927 } 928 929 /* 930 * Calculate page_colors from ecache_setsize. ecache_setsize contains 931 * the max ecache setsize of all cpus configured in the system or, for 932 * cheetah+ systems, the max possible ecache setsize for all possible 933 * cheetah+ cpus. 
934 */ 935 page_colors = ecache_setsize / MMU_PAGESIZE; 936 page_colors_mask = page_colors - 1; 937 938 vac_colors = vac_size / MMU_PAGESIZE; 939 vac_colors_mask = vac_colors -1; 940 941 page_coloring_shift = 0; 942 a = ecache_setsize; 943 while (a >>= 1) { 944 page_coloring_shift++; 945 } 946 947 /* initialize number of colors per page size */ 948 for (i = 0; i < mmu_page_sizes; i++) { 949 hw_page_array[i].hp_colors = (page_colors_mask >> 950 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 951 + 1; 952 colorequivszc[i] = 0; 953 } 954 955 /* 956 * initialize cpu_page_colors if ecache setsizes are homogenous. 957 * cpu_page_colors set to -1 during DR operation or during startup 958 * if setsizes are heterogenous. 959 * 960 * The value of cpu_page_colors determines if additional color bins 961 * need to be checked for a particular color in the page_get routines. 962 */ 963 if (cpu_setsize > 0 && cpu_page_colors == 0 && 964 cpu_setsize < ecache_setsize) { 965 cpu_page_colors = cpu_setsize / MMU_PAGESIZE; 966 a = lowbit(page_colors) - lowbit(cpu_page_colors); 967 ASSERT(a > 0); 968 ASSERT(a < 16); 969 970 for (i = 0; i < mmu_page_sizes; i++) { 971 if ((colors = hw_page_array[i].hp_colors) <= 1) { 972 continue; 973 } 974 while ((colors >> a) == 0) 975 a--; 976 ASSERT(a >= 0); 977 978 /* higher 4 bits encodes color equiv mask */ 979 colorequivszc[i] = (a << 4); 980 } 981 } 982 983 /* do cpu specific color initialization */ 984 if (&page_coloring_init_cpu) { 985 page_coloring_init_cpu(); 986 } 987 } 988 989 int 990 bp_color(struct buf *bp) 991 { 992 int color = -1; 993 994 if (vac) { 995 if ((bp->b_flags & B_PAGEIO) != 0) { 996 color = sfmmu_get_ppvcolor(bp->b_pages); 997 } else if (bp->b_un.b_addr != NULL) { 998 color = sfmmu_get_addrvcolor(bp->b_un.b_addr); 999 } 1000 } 1001 return (color < 0 ? 0 : ptob(color)); 1002 } 1003 1004 /* 1005 * Function for flushing D-cache when performing module relocations 1006 * to an alternate mapping. 
Stubbed out on all platforms except sun4u, 1007 * at least for now. 1008 */ 1009 void 1010 dcache_flushall() 1011 { 1012 sfmmu_cache_flushall(); 1013 } 1014 1015 static int 1016 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2) 1017 { 1018 if (va1 < va2 && va1 + sz1 <= va2) 1019 return (0); 1020 1021 if (va2 < va1 && va2 + sz2 <= va1) 1022 return (0); 1023 1024 return (1); 1025 } 1026 1027 /* 1028 * Return the number of bytes, relative to the beginning of a given range, that 1029 * are non-toxic (can be read from and written to with relative impunity). 1030 */ 1031 size_t 1032 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write) 1033 { 1034 /* OBP reads are harmless, but we don't want people writing there */ 1035 if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR - 1036 OFW_START_ADDR + 1)) 1037 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0); 1038 1039 if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE)) 1040 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0); 1041 1042 return (sz); /* no overlap */ 1043 } 1044 1045 /* 1046 * Minimum physmem required for enabling large pages for kernel heap 1047 * Currently we do not enable lp for kmem on systems with less 1048 * than 1GB of memory. This value can be changed via /etc/system 1049 */ 1050 size_t segkmem_lpminphysmem = 0x40000000; /* 1GB */ 1051 1052 /* 1053 * this function chooses large page size for kernel heap 1054 */ 1055 size_t 1056 get_segkmem_lpsize(size_t lpsize) 1057 { 1058 size_t memtotal = physmem * PAGESIZE; 1059 size_t mmusz; 1060 uint_t szc; 1061 1062 if (memtotal < segkmem_lpminphysmem) 1063 return (PAGESIZE); 1064 1065 if (plat_lpkmem_is_supported != NULL && 1066 plat_lpkmem_is_supported() == 0) 1067 return (PAGESIZE); 1068 1069 mmusz = mmu_get_kernel_lpsize(lpsize); 1070 szc = page_szc(mmusz); 1071 1072 while (szc) { 1073 if (!(disable_large_pages & (1 << szc))) 1074 return (page_get_pagesize(szc)); 1075 szc--; 1076 } 1077 return (PAGESIZE); 1078 }