1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * UNIX machine dependent virtual memory support.
  28  */
  29 
  30 #include <sys/vm.h>
  31 #include <sys/exec.h>
  32 
  33 #include <sys/exechdr.h>
  34 #include <vm/seg_kmem.h>
  35 #include <sys/atomic.h>
  36 #include <sys/archsystm.h>
  37 #include <sys/machsystm.h>
  38 #include <sys/kdi.h>
  39 #include <sys/cpu_module.h>
  40 #include <sys/secflags.h>
  41 
  42 #include <vm/hat_sfmmu.h>
  43 
  44 #include <sys/memnode.h>
  45 
  46 #include <sys/mem_config.h>
  47 #include <sys/mem_cage.h>
  48 #include <vm/vm_dep.h>
  49 #include <vm/page.h>
  50 #include <sys/platform_module.h>
  51 
/*
 * This variable is set by module-specific config routines.
 * It is only set by modules which will use physical cache page coloring.
 */
  56 int do_pg_coloring = 0;
  57 
/*
 * This variable can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module-specific config routines.
 */
  63 
  64 int use_page_coloring = 1;
  65 
  66 /*
  67  * initialized by page_coloring_init()
  68  */
  69 extern uint_t page_colors;
  70 extern uint_t page_colors_mask;
  71 extern uint_t page_coloring_shift;
  72 int cpu_page_colors;
  73 uint_t vac_colors = 0;
  74 uint_t vac_colors_mask = 0;
  75 
  76 /* cpu specific coloring initialization */
  77 extern void page_coloring_init_cpu();
  78 #pragma weak page_coloring_init_cpu
  79 
  80 /*
  81  * get the ecache setsize for the current cpu.
  82  */
  83 #define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)
  84 
  85 plcnt_t         plcnt;          /* page list count */
  86 
  87 /*
  88  * This variable is set by the cpu module to contain the lowest
  89  * address not affected by the SF_ERRATA_57 workaround.  It should
  90  * remain 0 if the workaround is not needed.
  91  */
  92 #if defined(SF_ERRATA_57)
  93 caddr_t errata57_limit;
  94 #endif
  95 
  96 extern void page_relocate_hash(page_t *, page_t *);
  97 
/*
 * These must be defined in platform-specific areas.
 */
 101 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
 102         struct proc *, uint_t);
 103 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
 104         caddr_t, size_t, uint_t, struct lgrp *);
 105 /*
 106  * Convert page frame number to an OBMEM page frame number
 107  * (i.e. put in the type bits -- zero for this implementation)
 108  */
 109 pfn_t
 110 impl_obmem_pfnum(pfn_t pf)
 111 {
 112         return (pf);
 113 }
 114 
/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
 124 int
 125 pf_is_memory(pfn_t pf)
 126 {
 127         /* We must be IO space */
 128         if (pf > physmax)
 129                 return (0);
 130 
 131         /* We must be memory space */
 132         return (1);
 133 }
 134 
 135 /*
 136  * Handle a pagefault.
 137  */
 138 faultcode_t
 139 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
 140 {
 141         struct as *as;
 142         struct proc *p;
 143         faultcode_t res;
 144         caddr_t base;
 145         size_t len;
 146         int err;
 147 
 148         if (INVALID_VADDR(addr))
 149                 return (FC_NOMAP);
 150 
 151         if (iskernel) {
 152                 as = &kas;
 153         } else {
 154                 p = curproc;
 155                 as = p->p_as;
 156 #if defined(SF_ERRATA_57)
 157                 /*
 158                  * Prevent infinite loops due to a segment driver
 159                  * setting the execute permissions and the sfmmu hat
 160                  * silently ignoring them.
 161                  */
 162                 if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
 163                     addr < errata57_limit) {
 164                         res = FC_NOMAP;
 165                         goto out;
 166                 }
 167 #endif
 168         }
 169 
 170         /*
 171          * Dispatch pagefault.
 172          */
 173         res = as_fault(as->a_hat, as, addr, 1, type, rw);
 174 
 175         /*
 176          * If this isn't a potential unmapped hole in the user's
 177          * UNIX data or stack segments, just return status info.
 178          */
 179         if (!(res == FC_NOMAP && iskernel == 0))
 180                 goto out;
 181 
        /*
         * Check to see if we happened to fault on a currently unmapped
         * part of the UNIX data or stack segments.  If so, create a zfod
         * mapping there and then try calling the fault routine again.
         */
 187         base = p->p_brkbase;
 188         len = p->p_brksize;
 189 
 190         if (addr < base || addr >= base + len) {          /* data seg? */
 191                 base = (caddr_t)(p->p_usrstack - p->p_stksize);
 192                 len = p->p_stksize;
 193                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 194                         /* not in either UNIX data or stack segments */
 195                         res = FC_NOMAP;
 196                         goto out;
 197                 }
 198         }
 199 
        /*
         * The rest of this function implements 3.X/4.X/5.X compatibility.
         * This code is probably not needed anymore.
         */
 202 
 203         /* expand the gap to the page boundaries on each side */
 204         len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
 205             ((uintptr_t)base & PAGEMASK);
 206         base = (caddr_t)((uintptr_t)base & PAGEMASK);
 207 
 208         as_rangelock(as);
 209         as_purge(as);
 210         if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
 211                 err = as_map(as, base, len, segvn_create, zfod_argsp);
 212                 as_rangeunlock(as);
 213                 if (err) {
 214                         res = FC_MAKE_ERR(err);
 215                         goto out;
 216                 }
 217         } else {
                /*
                 * This page was already mapped by another thread after we
                 * returned from as_fault() above.  We just fall through to
                 * the as_fault() below.
                 */
 223                 as_rangeunlock(as);
 224         }
 225 
 226         res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
 227 
 228 out:
 229 
 230         return (res);
 231 }
 232 
 233 /*
 234  * This is the routine which defines the address limit implied
 235  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 236  * mappable address in a 32-bit process on this platform (though
 237  * perhaps we should make it be UINT32_MAX here?)
 238  */
 239 void
 240 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 241 {
 242         struct proc *p = curproc;
 243         caddr_t userlimit = flags & _MAP_LOW32 ?
 244             (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
 245         map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
 246 }
 247 
 248 /*
 249  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 250  */
 251 caddr_t hole_start, hole_end;
 252 
 253 /*
 254  * kpm mapping window
 255  */
 256 caddr_t kpm_vbase;
 257 size_t  kpm_size;
 258 uchar_t kpm_size_shift;
 259 
 260 int valid_va_range_aligned_wraparound;
 261 /*
 262  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 263  * addresses at least "minlen" long, where the base of the range is at "off"
 264  * phase from an "align" boundary and there is space for a "redzone"-sized
 265  * redzone on either side of the range.  On success, 1 is returned and *basep
 266  * and *lenp are adjusted to describe the acceptable range (including
 267  * the redzone).  On failure, 0 is returned.
 268  */
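/*
 * Illustrative example (hypothetical values, not taken from any particular
 * platform): suppose *basep = 0x12000, *lenp = 0x30000, minlen = 0x2000,
 * redzone = 0x2000, align = 0x10000, off = 0, and no MMU hole intersects
 * the range.  Then tot_len = 0x6000, which fits; the alignment check rounds
 * lo + redzone (0x14000) up to 0x20000, leaving 0x20000 bytes before
 * hi - redzone (0x40000), which covers minlen.  The routine returns 1 and
 * leaves *basep and *lenp describing [0x12000, 0x42000).
 */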
 269 int
 270 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 271     size_t align, size_t redzone, size_t off)
 272 {
 273         caddr_t hi, lo;
 274         size_t tot_len;
 275 
 276         ASSERT(align == 0 ? off == 0 : off < align);
 277         ASSERT(ISP2(align));
 278         ASSERT(align == 0 || align >= PAGESIZE);
 279 
 280         lo = *basep;
 281         hi = lo + *lenp;
 282         tot_len = minlen + 2 * redzone; /* need at least this much space */
 283 
        /* If hi rolled over the top, try cutting back. */
 285         if (hi < lo) {
 286                 *lenp = 0UL - (uintptr_t)lo - 1UL;
 287                 /* Trying to see if this really happens, and then if so, why */
 288                 valid_va_range_aligned_wraparound++;
 289                 hi = lo + *lenp;
 290         }
 291         if (*lenp < tot_len) {
 292                 return (0);
 293         }
 294 
 295         /*
 296          * Deal with a possible hole in the address range between
 297          * hole_start and hole_end that should never be mapped by the MMU.
 298          */
 299 
 300         if (lo < hole_start) {
                if (hi > hole_start) {
                        if (hi < hole_end) {
                                hi = hole_start;
                        } else {
                                /* lo < hole_start && hi >= hole_end */
                                if (dir == AH_LO) {
                                        /*
                                         * prefer lowest range
                                         */
                                        if (hole_start - lo >= tot_len)
                                                hi = hole_start;
                                        else if (hi - hole_end >= tot_len)
                                                lo = hole_end;
                                        else
                                                return (0);
                                } else {
                                        /*
                                         * prefer highest range
                                         */
                                        if (hi - hole_end >= tot_len)
                                                lo = hole_end;
                                        else if (hole_start - lo >= tot_len)
                                                hi = hole_start;
                                        else
                                                return (0);
                                }
                        }
                }
 327         } else {
 328                 /* lo >= hole_start */
 329                 if (hi < hole_end)
 330                         return (0);
 331                 if (lo < hole_end)
 332                         lo = hole_end;
 333         }
 334 
 335         /* Check if remaining length is too small */
 336         if (hi - lo < tot_len) {
 337                 return (0);
 338         }
 339         if (align > 1) {
 340                 caddr_t tlo = lo + redzone;
 341                 caddr_t thi = hi - redzone;
 342                 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
 343                 if (tlo < lo + redzone) {
 344                         return (0);
 345                 }
 346                 if (thi < tlo || thi - tlo < minlen) {
 347                         return (0);
 348                 }
 349         }
 350         *basep = lo;
 351         *lenp = hi - lo;
 352         return (1);
 353 }
 354 
 355 /*
 356  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 357  * addresses at least "minlen" long.  On success, 1 is returned and *basep
 358  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 359  * is returned.
 360  */
 361 int
 362 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
 363 {
 364         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
 365 }
 366 
 367 /*
 368  * Default to forbidding the first 64k of address space.  This protects most
 369  * reasonably sized structures from dereferences through NULL:
 370  *     ((foo_t *)0)->bar
 371  */
 372 uintptr_t forbidden_null_mapping_sz = 0x10000;
 373 
 374 /*
 375  * Determine whether [addr, addr+len] with protections `prot' are valid
 376  * for a user address space.
 377  */
 378 /*ARGSUSED*/
 379 int
 380 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
 381     caddr_t userlimit)
 382 {
 383         caddr_t eaddr = addr + len;
 384 
 385         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
 386                 return (RANGE_BADADDR);
 387 
 388         if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
 389             secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
 390                 return (RANGE_BADADDR);
 391 
 392         /*
 393          * Determine if the address range falls within an illegal
 394          * range of the MMU.
 395          */
 396         if (eaddr > hole_start && addr < hole_end)
 397                 return (RANGE_BADADDR);
 398 
 399 #if defined(SF_ERRATA_57)
 400         /*
 401          * Make sure USERLIMIT isn't raised too high
 402          */
 403         ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
 404             errata57_limit == 0);
 405 
 406         if (AS_TYPE_64BIT(as) &&
 407             (addr < errata57_limit) &&
 408             (prot & PROT_EXEC))
 409                 return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
 411         return (RANGE_OKAY);
 412 }
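
/*
 * For example, with the default forbidden_null_mapping_sz of 0x10000 above,
 * a request to map at address 0x8000 in a process that has
 * PROC_SEC_FORBIDNULLMAP enabled fails with RANGE_BADADDR, while the same
 * request succeeds (subject to the other checks) when the secflag is clear.
 */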
 413 
 414 /*
 * Routine used to check whether an a.out can be executed
 416  * by the current machine/architecture.
 417  */
 418 int
 419 chkaout(struct exdata *exp)
 420 {
 421         if (exp->ux_mach == M_SPARC)
 422                 return (0);
 423         else
 424                 return (ENOEXEC);
 425 }
 426 
 427 /*
 428  * The following functions return information about an a.out
 429  * which is used when a program is executed.
 430  */
 431 
 432 /*
 433  * Return the load memory address for the data segment.
 434  */
 435 caddr_t
 436 getdmem(struct exec *exp)
 437 {
 438         /*
 439          * XXX - Sparc Reference Hack approaching
 440          * Remember that we are loading
 441          * 8k executables into a 4k machine
 442          * DATA_ALIGN == 2 * PAGESIZE
 443          */
 444         if (exp->a_text)
 445                 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
 446         else
 447                 return ((caddr_t)USRTEXT);
 448 }
 449 
 450 /*
 451  * Return the starting disk address for the data segment.
 452  */
 453 ulong_t
 454 getdfile(struct exec *exp)
 455 {
 456         if (exp->a_magic == ZMAGIC)
 457                 return (exp->a_text);
 458         else
 459                 return (sizeof (struct exec) + exp->a_text);
 460 }
 461 
 462 /*
 463  * Return the load memory address for the text segment.
 464  */
 465 
 466 /*ARGSUSED*/
 467 caddr_t
 468 gettmem(struct exec *exp)
 469 {
 470         return ((caddr_t)USRTEXT);
 471 }
 472 
 473 /*
 474  * Return the file byte offset for the text segment.
 475  */
 476 uint_t
 477 gettfile(struct exec *exp)
 478 {
 479         if (exp->a_magic == ZMAGIC)
 480                 return (0);
 481         else
 482                 return (sizeof (struct exec));
 483 }
 484 
 485 void
 486 getexinfo(
 487         struct exdata *edp_in,
 488         struct exdata *edp_out,
 489         int *pagetext,
 490         int *pagedata)
 491 {
 492         *edp_out = *edp_in;     /* structure copy */
 493 
 494         if ((edp_in->ux_mag == ZMAGIC) &&
 495             ((edp_in->vp->v_flag & VNOMAP) == 0)) {
 496                 *pagetext = 1;
 497                 *pagedata = 1;
 498         } else {
 499                 *pagetext = 0;
 500                 *pagedata = 0;
 501         }
 502 }
 503 
/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address such that its offset is equal modulo
 * shm_alignment and assumes it can't be in VAC conflict with any mapping
 * larger than PAGESIZE.
 */
 509 int
 510 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 511 {
 512         if (vac) {
 513                 return (((uintptr_t)addr ^ off) & shm_alignment - 1);
 514         } else {
 515                 return (0);
 516         }
 517 }
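
/*
 * Worked example (assuming a hypothetical shm_alignment of 0x10000 on a
 * VAC system): for addr = 0x20000 and off = 0x8000, (addr ^ off) masked
 * with (shm_alignment - 1) yields 0x8000, a non-zero value, so the
 * address/offset pair could produce a VAC alias and should be realigned.
 * With addr = 0x28000 and the same offset the result is 0, meaning the
 * mapping is VAC-consistent with KPM.
 */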
 518 
/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than privm_lpg_min_physmem or shm_lpg_min_physmem
 * memory installed.  The units for these variables are 8K pages.
 */
 524 pgcnt_t shm_lpg_min_physmem = 131072;                   /* 1GB */
 525 pgcnt_t privm_lpg_min_physmem = 131072;                 /* 1GB */
 526 
 527 static size_t
 528 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
 529 {
 530         size_t          pgsz = MMU_PAGESIZE;
 531         int             szc;
 532 
        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         * Use at least the default page size.
         */
 537         if (len == 0) {
 538                 len = p->p_brkbase + p->p_brksize - p->p_bssbase;
 539         }
 540         len = MAX(len, default_uheap_lpsize);
 541 
 542         for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 543                 pgsz = hw_page_array[szc].hp_size;
 544                 if ((disable_auto_data_large_pages & (1 << szc)) ||
 545                     pgsz > max_uheap_lpsize)
 546                         continue;
 547                 if (len >= pgsz) {
 548                         break;
 549                 }
 550         }
 551 
 552         /*
 553          * If addr == 0 we were called by memcntl() when the
 554          * size code is 0.  Don't set pgsz less than current size.
 555          */
 556         if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
 557                 pgsz = hw_page_array[p->p_brkpageszc].hp_size;
 558         }
 559 
 560         return (pgsz);
 561 }
 562 
 563 static size_t
 564 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
 565 {
 566         size_t          pgsz = MMU_PAGESIZE;
 567         int             szc;
 568 
        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         * Use at least the default page size.
         */
 573         if (len == 0) {
 574                 len = p->p_stksize;
 575         }
 576         len = MAX(len, default_ustack_lpsize);
 577 
 578         for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 579                 pgsz = hw_page_array[szc].hp_size;
 580                 if ((disable_auto_data_large_pages & (1 << szc)) ||
 581                     pgsz > max_ustack_lpsize)
 582                         continue;
 583                 if (len >= pgsz) {
 584                         break;
 585                 }
 586         }
 587 
 588         /*
 589          * If addr == 0 we were called by memcntl() or exec_args() when the
 590          * size code is 0.  Don't set pgsz less than current size.
 591          */
 592         if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
 593                 pgsz = hw_page_array[p->p_stkpageszc].hp_size;
 594         }
 595 
 596         return (pgsz);
 597 }
 598 
 599 static size_t
 600 map_pgszism(caddr_t addr, size_t len)
 601 {
 602         uint_t szc;
 603         size_t pgsz;
 604 
 605         for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
 606                 if (disable_ism_large_pages & (1 << szc))
 607                         continue;
 608 
 609                 pgsz = hw_page_array[szc].hp_size;
 610                 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
 611                         return (pgsz);
 612         }
 613 
 614         return (DEFAULT_ISM_PAGESIZE);
 615 }
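
/*
 * For instance (assuming the usual sun4 size codes, where TTE4M maps 4M
 * pages and that size is not disabled), a 6MB ISM segment whose base is
 * 4MB-aligned satisfies the length and alignment checks for TTE4M and is
 * mapped with 4M pages; a segment that is smaller than 4M, or not
 * 4M-aligned, falls back to DEFAULT_ISM_PAGESIZE.
 */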
 616 
 617 /*
 618  * Suggest a page size to be used to map a segment of type maptype and length
 619  * len.  Returns a page size (not a size code).
 620  */
 621 /* ARGSUSED */
 622 size_t
 623 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 624 {
 625         size_t  pgsz = MMU_PAGESIZE;
 626 
 627         ASSERT(maptype != MAPPGSZ_VA);
 628 
 629         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 630                 return (MMU_PAGESIZE);
 631         }
 632 
 633         switch (maptype) {
 634         case MAPPGSZ_ISM:
 635                 pgsz = map_pgszism(addr, len);
 636                 break;
 637 
 638         case MAPPGSZ_STK:
 639                 if (max_ustack_lpsize > MMU_PAGESIZE) {
 640                         pgsz = map_pgszstk(p, addr, len);
 641                 }
 642                 break;
 643 
 644         case MAPPGSZ_HEAP:
 645                 if (max_uheap_lpsize > MMU_PAGESIZE) {
 646                         pgsz = map_pgszheap(p, addr, len);
 647                 }
 648                 break;
 649         }
 650         return (pgsz);
 651 }
 652 
 653 
 654 /* assumes TTE8K...TTE4M == szc */
 655 
 656 static uint_t
 657 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
 658     size_t max_lpsize, size_t min_physmem)
 659 {
 660         caddr_t eaddr = addr + size;
 661         uint_t szcvec = 0;
 662         caddr_t raddr;
 663         caddr_t readdr;
 664         size_t pgsz;
 665         int i;
 666 
 667         if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 668                 return (0);
 669         }
 670         for (i = mmu_page_sizes - 1; i > 0; i--) {
 671                 if (disable_lpgs & (1 << i)) {
 672                         continue;
 673                 }
 674                 pgsz = page_get_pagesize(i);
 675                 if (pgsz > max_lpsize) {
 676                         continue;
 677                 }
 678                 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 679                 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 680                 if (raddr < addr || raddr >= readdr) {
 681                         continue;
 682                 }
 683                 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 684                         continue;
 685                 }
 686                 szcvec |= (1 << i);
                /*
                 * Also OR in the remaining enabled page sizes.
                 */
 690                 szcvec |= P2PHASE(~disable_lpgs, (1 << i));
 691                 szcvec &= ~1; /* no need to return 8K pagesize */
 692                 break;
 693         }
 694         return (szcvec);
 695 }
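
/*
 * Worked example (hypothetical configuration where the largest supported
 * page size is 4M with size code 3, size codes 0-2 being 8K, 64K, and 512K,
 * nothing disabled, and generous max_lpsize/physmem): for a 12MB region
 * starting at a 4MB-aligned address with off = 0, the loop stops at the 4M
 * size code, sets bit 3, ORs in bits 1 and 2 for the smaller enabled sizes,
 * and clears bit 0, returning 0xe.  The caller can then use any of the 64K,
 * 512K, or 4M size codes for this range.
 */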
 696 
 697 /*
 698  * Return a bit vector of large page size codes that
 699  * can be used to map [addr, addr + len) region.
 700  */
 701 /* ARGSUSED */
 702 uint_t
 703 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 704     int memcntl)
 705 {
 706         if (flags & MAP_TEXT) {
 707                 return (map_szcvec(addr, size, off,
 708                     disable_auto_text_large_pages,
 709                     max_utext_lpsize, shm_lpg_min_physmem));
 710 
 711         } else if (flags & MAP_INITDATA) {
 712                 return (map_szcvec(addr, size, off,
 713                     disable_auto_data_large_pages,
 714                     max_uidata_lpsize, privm_lpg_min_physmem));
 715 
 716         } else if (type == MAPPGSZC_SHM) {
 717                 return (map_szcvec(addr, size, off,
 718                     disable_auto_data_large_pages,
 719                     max_shm_lpsize, shm_lpg_min_physmem));
 720 
 721         } else if (type == MAPPGSZC_HEAP) {
 722                 return (map_szcvec(addr, size, off,
 723                     disable_auto_data_large_pages,
 724                     max_uheap_lpsize, privm_lpg_min_physmem));
 725 
 726         } else if (type == MAPPGSZC_STACK) {
 727                 return (map_szcvec(addr, size, off,
 728                     disable_auto_data_large_pages,
 729                     max_ustack_lpsize, privm_lpg_min_physmem));
 730 
 731         } else {
 732                 return (map_szcvec(addr, size, off,
 733                     disable_auto_data_large_pages,
 734                     max_privmap_lpsize, privm_lpg_min_physmem));
 735         }
 736 }
 737 
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax), and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *      page_counters[page_size][region_size]
 *
 *      page_size:      TTE size code of pages on page_size freelist.
 *
 *      region_size:    TTE size code of a candidate larger page made up
 *                      of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size, each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *      page_counters[1][3] is the table element used for identifying
 *      candidate 4M pages from contiguous pages off the 64K free list.
 *      Each index in the page_counters[1][3].array spans 4M.  It's the
 *      number of free 512K size (region_size - 1) groups of contiguous
 *      64K free pages.  So when page_counters[1][3].counters[n] == 8
 *      we know we have a candidate 4M page made up of 512K size groups
 *      of 64K free pages.
 */
 766 
 767 /*
 768  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 769  * dimensions are allocated dynamically.
 770  */
 771 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
 772 
 773 /*
 774  * For now there is only a single size cache list.
 775  * Allocated dynamically.
 776  */
 777 page_t ***page_cachelists[MAX_MEM_TYPES];
 778 
 779 kmutex_t *fpc_mutex[NPC_MUTEX];
 780 kmutex_t *cpc_mutex[NPC_MUTEX];
 781 
 782 /*
 783  * Calculate space needed for page freelists and counters
 784  */
 785 size_t
 786 calc_free_pagelist_sz(void)
 787 {
 788         int szc;
 789         size_t alloc_sz, cache_sz, free_sz;
 790 
 791         /*
 792          * one cachelist per color, node, and type
 793          */
 794         cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
 795             sizeof (page_t **);
 796         cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
 797 
 798         /*
 799          * one freelist per size, color, node, and type
 800          */
 801         free_sz = sizeof (page_t **);
 802         for (szc = 0; szc < mmu_page_sizes; szc++)
 803                 free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
 804         free_sz *= max_mem_nodes * MAX_MEM_TYPES;
 805 
 806         alloc_sz = cache_sz + free_sz + page_ctrs_sz();
 807         return (alloc_sz);
 808 }
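
/*
 * Sizing example (hypothetical numbers): with max_mem_nodes = 1,
 * MAX_MEM_TYPES = 2, 2048 colors for 8K pages, and 8-byte pointers,
 * the cachelist portion is (2048 * 8 + 8) * 1 * 2 = 32784 bytes; the
 * freelist portion adds a similar per-size term for every supported
 * page size, and page_ctrs_sz() accounts for the page counters
 * described above.
 */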
 809 
 810 caddr_t
 811 alloc_page_freelists(caddr_t alloc_base)
 812 {
 813         int     mnode, mtype;
 814         int     szc, clrs;
 815 
 816         /*
 817          * We only support small pages in the cachelist.
 818          */
 819         for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
 820                 page_cachelists[mtype] = (page_t ***)alloc_base;
 821                 alloc_base += (max_mem_nodes * sizeof (page_t **));
 822                 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 823                         page_cachelists[mtype][mnode] = (page_t **)alloc_base;
 824                         alloc_base +=
 825                             (page_get_pagecolors(0) * sizeof (page_t *));
 826                 }
 827         }
 828 
 829         /*
 830          * Allocate freelists bins for all
 831          * supported page sizes.
 832          */
 833         for (szc = 0; szc < mmu_page_sizes; szc++) {
 834                 clrs = page_get_pagecolors(szc);
 835                 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
 836                         page_freelists[szc][mtype] = (page_t ***)alloc_base;
 837                         alloc_base += (max_mem_nodes * sizeof (page_t **));
 838                         for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 839                                 page_freelists[szc][mtype][mnode] =
 840                                     (page_t **)alloc_base;
 841                                 alloc_base += (clrs * (sizeof (page_t *)));
 842                         }
 843                 }
 844         }
 845 
 846         alloc_base = page_ctrs_alloc(alloc_base);
 847         return (alloc_base);
 848 }
 849 
 850 /*
 851  * Allocate page_freelists locks for a memnode from the nucleus data
 852  * area. This is the first time that mmu_page_sizes is used during
 853  * bootup, so check mmu_page_sizes initialization.
 854  */
 855 int
 856 ndata_alloc_page_mutexs(struct memlist *ndata)
 857 {
 858         size_t alloc_sz;
 859         caddr_t alloc_base;
 860         int     i;
 861         void    page_coloring_init();
 862 
 863         page_coloring_init();
 864         if (&mmu_init_mmu_page_sizes) {
 865                 if (!mmu_init_mmu_page_sizes(0)) {
 866                         cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
 867                             mmu_page_sizes);
 868                 }
 869         }
 870         ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
 871 
 872         /* fpc_mutex and cpc_mutex */
 873         alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
 874 
 875         alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
 876         if (alloc_base == NULL)
 877                 return (-1);
 878 
 879         ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
 880 
 881         for (i = 0; i < NPC_MUTEX; i++) {
 882                 fpc_mutex[i] = (kmutex_t *)alloc_base;
 883                 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 884                 cpc_mutex[i] = (kmutex_t *)alloc_base;
 885                 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 886         }
 887         return (0);
 888 }
 889 
 890 /*
 891  * To select our starting bin, we stride through the bins with a stride
 892  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 893  * in simulation and practice for different workloads on varying cache sizes.
 894  */
 895 uint32_t color_start_current = 0;
 896 uint32_t color_start_stride = 337;
 897 int color_start_random = 0;
 898 
 899 /* ARGSUSED */
 900 uint_t
 901 get_color_start(struct as *as)
 902 {
 903         uint32_t old, new;
 904 
 905         if (consistent_coloring == 2 || color_start_random) {
 906                 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
 907                     (hw_page_array[0].hp_colors - 1)));
 908         }
 909 
 910         do {
 911                 old = color_start_current;
 912                 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
 913         } while (atomic_cas_32(&color_start_current, old, new) != old);
 914 
 915         return ((uint_t)(new));
 916 }
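
/*
 * Example of the stride (assuming, hypothetically, vac_shift = 16 and
 * MMU_PAGESHIFT = 13): each call advances color_start_current by
 * color_start_stride << 3 = 2696, spreading the starting bins of
 * successive address spaces.
 */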
 917 
 918 /*
 919  * Called once at startup from kphysm_init() -- before memialloc()
 920  * is invoked to do the 1st page_free()/page_freelist_add().
 921  *
 922  * initializes page_colors and page_colors_mask based on ecache_setsize.
 923  *
 924  * Also initializes the counter locks.
 925  */
 926 void
 927 page_coloring_init()
 928 {
 929         int     a, i;
 930         uint_t colors;
 931 
 932         if (do_pg_coloring == 0) {
 933                 page_colors = 1;
 934                 for (i = 0; i < mmu_page_sizes; i++) {
 935                         colorequivszc[i] = 0;
 936                         hw_page_array[i].hp_colors = 1;
 937                 }
 938                 return;
 939         }
 940 
 941         /*
 942          * Calculate page_colors from ecache_setsize. ecache_setsize contains
 943          * the max ecache setsize of all cpus configured in the system or, for
 944          * cheetah+ systems, the max possible ecache setsize for all possible
 945          * cheetah+ cpus.
 946          */
 947         page_colors = ecache_setsize / MMU_PAGESIZE;
 948         page_colors_mask = page_colors - 1;
 949 
 950         vac_colors = vac_size / MMU_PAGESIZE;
        vac_colors_mask = vac_colors - 1;
 952 
 953         page_coloring_shift = 0;
 954         a = ecache_setsize;
 955         while (a >>= 1) {
 956                 page_coloring_shift++;
 957         }
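
        /*
         * Example (hypothetical 1MB ecache setsize): page_colors =
         * 0x100000 / 0x2000 = 128, page_colors_mask = 127, and the loop
         * above leaves page_coloring_shift = 20 (log2 of the setsize).
         */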
 958 
 959         /* initialize number of colors per page size */
 960         for (i = 0; i < mmu_page_sizes; i++) {
 961                 hw_page_array[i].hp_colors = (page_colors_mask >>
 962                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
 963                     + 1;
 964                 colorequivszc[i] = 0;
 965         }
 966 
        /*
         * Initialize cpu_page_colors if ecache setsizes are homogeneous.
         * cpu_page_colors is set to -1 during DR operation or during startup
         * if setsizes are heterogeneous.
         *
         * The value of cpu_page_colors determines if additional color bins
         * need to be checked for a particular color in the page_get routines.
         */
 975         if (cpu_setsize > 0 && cpu_page_colors == 0 &&
 976             cpu_setsize < ecache_setsize) {
 977                 cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
 978                 a = lowbit(page_colors) - lowbit(cpu_page_colors);
 979                 ASSERT(a > 0);
 980                 ASSERT(a < 16);
 981 
 982                 for (i = 0; i < mmu_page_sizes; i++) {
 983                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
 984                                 continue;
 985                         }
 986                         while ((colors >> a) == 0)
 987                                 a--;
 988                         ASSERT(a >= 0);
 989 
 990                         /* higher 4 bits encodes color equiv mask */
 991                         colorequivszc[i] = (a << 4);
 992                 }
 993         }
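
        /*
         * For example (hypothetical setsizes): cpu_setsize = 256K with
         * ecache_setsize = 1MB gives cpu_page_colors = 32 and
         * a = lowbit(128) - lowbit(32) = 8 - 6 = 2, so colorequivszc[] for
         * 8K pages encodes a color equivalence shift of 2 in its upper four
         * bits (0x20), which the page_get routines use to group colors into
         * equivalence classes.
         */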
 994 
 995         /* do cpu specific color initialization */
 996         if (&page_coloring_init_cpu) {
 997                 page_coloring_init_cpu();
 998         }
 999 }
1000 
1001 int
1002 bp_color(struct buf *bp)
1003 {
1004         int color = -1;
1005 
1006         if (vac) {
1007                 if ((bp->b_flags & B_PAGEIO) != 0) {
1008                         color = sfmmu_get_ppvcolor(bp->b_pages);
1009                 } else if (bp->b_un.b_addr != NULL) {
1010                         color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1011                 }
1012         }
1013         return (color < 0 ? 0 : ptob(color));
1014 }
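
/*
 * For example, a B_PAGEIO buffer whose first page has virtual color 3
 * (as reported by sfmmu_get_ppvcolor()) yields ptob(3), i.e. a 0x6000
 * byte offset with 8K pages; buffers with no VAC or no usable address
 * report color 0.
 */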
1015 
1016 /*
1017  * Function for flushing D-cache when performing module relocations
1018  * to an alternate mapping.  Stubbed out on all platforms except sun4u,
1019  * at least for now.
1020  */
1021 void
1022 dcache_flushall()
1023 {
1024         sfmmu_cache_flushall();
1025 }
1026 
1027 static int
1028 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
1029 {
1030         if (va1 < va2 && va1 + sz1 <= va2)
1031                 return (0);
1032 
1033         if (va2 < va1 && va2 + sz2 <= va1)
1034                 return (0);
1035 
1036         return (1);
1037 }
1038 
1039 /*
1040  * Return the number of bytes, relative to the beginning of a given range, that
1041  * are non-toxic (can be read from and written to with relative impunity).
1042  */
1043 size_t
1044 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1045 {
1046         /* OBP reads are harmless, but we don't want people writing there */
1047         if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1048             OFW_START_ADDR + 1))
1049                 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1050 
1051         if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1052                 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1053 
1054         return (sz); /* no overlap */
1055 }
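
/*
 * For instance, a write request starting 0x1000 bytes below OFW_START_ADDR
 * and extending into the OBP region is reported as having only 0x1000
 * non-toxic bytes; a read of the same range is considered safe in full
 * unless it also overlaps the PIO map.
 */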
1056 
/*
 * Minimum physmem required for enabling large pages for the kernel heap.
 * Currently we do not enable large pages for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
1062 size_t segkmem_lpminphysmem = 0x40000000;       /* 1GB */
1063 
/*
 * This function chooses the large page size for the kernel heap.
 */
1067 size_t
1068 get_segkmem_lpsize(size_t lpsize)
1069 {
1070         size_t memtotal = physmem * PAGESIZE;
1071         size_t mmusz;
1072         uint_t szc;
1073 
1074         if (memtotal < segkmem_lpminphysmem)
1075                 return (PAGESIZE);
1076 
1077         if (plat_lpkmem_is_supported != NULL &&
1078             plat_lpkmem_is_supported() == 0)
1079                 return (PAGESIZE);
1080 
1081         mmusz = mmu_get_kernel_lpsize(lpsize);
1082         szc = page_szc(mmusz);
1083 
1084         while (szc) {
1085                 if (!(disable_large_pages & (1 << szc)))
1086                         return (page_get_pagesize(szc));
1087                 szc--;
1088         }
1089         return (PAGESIZE);
1090 }
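
/*
 * Usage sketch (hypothetical platform values): on a machine with 4GB of
 * memory where mmu_get_kernel_lpsize() reports 4M, get_segkmem_lpsize()
 * returns 4M unless that size code is set in disable_large_pages, in
 * which case it walks down to the next enabled size (e.g. 512K or 64K)
 * and finally falls back to PAGESIZE.
 */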