1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * UNIX machine dependent virtual memory support.
  28  */
  29 
  30 #include <sys/vm.h>
  31 #include <sys/exec.h>
  32 
  33 #include <sys/exechdr.h>
  34 #include <vm/seg_kmem.h>
  35 #include <sys/atomic.h>
  36 #include <sys/archsystm.h>
  37 #include <sys/machsystm.h>
  38 #include <sys/kdi.h>
  39 #include <sys/cpu_module.h>
  40 
  41 #include <vm/hat_sfmmu.h>
  42 
  43 #include <sys/memnode.h>
  44 
  45 #include <sys/mem_config.h>
  46 #include <sys/mem_cage.h>
  47 #include <vm/vm_dep.h>
  48 #include <vm/page.h>
  49 #include <sys/platform_module.h>
  50 
  51 /*
  52  * These variables are set by module specific config routines.
  53  * They are only set by modules which will use physical cache page coloring.
  54  */
  55 int do_pg_coloring = 0;
  56 
  57 /*
  58  * These variables can be conveniently patched at kernel load time to
  59  * prevent do_pg_coloring from being enabled by
  60  * module specific config routines.
  61  */
  62 
  63 int use_page_coloring = 1;
  64 
  65 /*
  66  * initialized by page_coloring_init()
  67  */
  68 extern uint_t page_colors;
  69 extern uint_t page_colors_mask;
  70 extern uint_t page_coloring_shift;
  71 int cpu_page_colors;
  72 uint_t vac_colors = 0;
  73 uint_t vac_colors_mask = 0;
  74 
  75 /* cpu specific coloring initialization */
  76 extern void page_coloring_init_cpu();
  77 #pragma weak page_coloring_init_cpu
  78 
  79 /*
  80  * get the ecache setsize for the current cpu.
  81  */
  82 #define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)
  83 
  84 plcnt_t         plcnt;          /* page list count */
  85 
  86 /*
  87  * This variable is set by the cpu module to contain the lowest
  88  * address not affected by the SF_ERRATA_57 workaround.  It should
  89  * remain 0 if the workaround is not needed.
  90  */
  91 #if defined(SF_ERRATA_57)
  92 caddr_t errata57_limit;
  93 #endif
  94 
  95 extern void page_relocate_hash(page_t *, page_t *);
  96 
  97 /*
  98  * these must be defined in platform specific areas
  99  */
 100 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
 101         struct proc *, uint_t);
 102 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
 103         caddr_t, size_t, uint_t, struct lgrp *);
 104 /*
 105  * Convert page frame number to an OBMEM page frame number
 106  * (i.e. put in the type bits -- zero for this implementation)
 107  */
 108 pfn_t
 109 impl_obmem_pfnum(pfn_t pf)
 110 {
 111         return (pf);
 112 }
 113 
 114 /*
 115  * Use physmax to determine the highest physical page of DRAM memory
 116  * It is assumed that any physical addresses above physmax is in IO space.
 117  * We don't bother checking the low end because we assume that memory space
 118  * begins at physical page frame 0.
 119  *
 120  * Return 1 if the page frame is onboard DRAM memory, else 0.
 121  * Returns 0 for nvram so it won't be cached.
 122  */
 123 int
 124 pf_is_memory(pfn_t pf)
 125 {
 126         /* We must be IO space */
 127         if (pf > physmax)
 128                 return (0);
 129 
 130         /* We must be memory space */
 131         return (1);
 132 }
 133 
 134 /*
 135  * Handle a pagefault.
 136  */
 137 faultcode_t
 138 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
 139 {
 140         struct as *as;
 141         struct proc *p;
 142         faultcode_t res;
 143         caddr_t base;
 144         size_t len;
 145         int err;
 146 
 147         if (INVALID_VADDR(addr))
 148                 return (FC_NOMAP);
 149 
 150         if (iskernel) {
 151                 as = &kas;
 152         } else {
 153                 p = curproc;
 154                 as = p->p_as;
 155 #if defined(SF_ERRATA_57)
 156                 /*
 157                  * Prevent infinite loops due to a segment driver
 158                  * setting the execute permissions and the sfmmu hat
 159                  * silently ignoring them.
 160                  */
 161                 if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
 162                     addr < errata57_limit) {
 163                         res = FC_NOMAP;
 164                         goto out;
 165                 }
 166 #endif
 167         }
 168 
 169         /*
 170          * Dispatch pagefault.
 171          */
 172         res = as_fault(as->a_hat, as, addr, 1, type, rw);
 173 
 174         /*
 175          * If this isn't a potential unmapped hole in the user's
 176          * UNIX data or stack segments, just return status info.
 177          */
 178         if (!(res == FC_NOMAP && iskernel == 0))
 179                 goto out;
 180 
 181         /*
 182          * Check to see if we happened to faulted on a currently unmapped
 183          * part of the UNIX data or stack segments.  If so, create a zfod
 184          * mapping there and then try calling the fault routine again.
 185          */
 186         base = p->p_brkbase;
 187         len = p->p_brksize;
 188 
 189         if (addr < base || addr >= base + len) {          /* data seg? */
 190                 base = (caddr_t)(p->p_usrstack - p->p_stksize);
 191                 len = p->p_stksize;
 192                 if (addr < base || addr >= p->p_usrstack) {    /* stack seg? */
 193                         /* not in either UNIX data or stack segments */
 194                         res = FC_NOMAP;
 195                         goto out;
 196                 }
 197         }
 198 
 199         /* the rest of this function implements a 3.X 4.X 5.X compatibility */
 200         /* This code is probably not needed anymore */
 201 
 202         /* expand the gap to the page boundaries on each side */
 203         len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
 204             ((uintptr_t)base & PAGEMASK);
 205         base = (caddr_t)((uintptr_t)base & PAGEMASK);
 206 
 207         as_rangelock(as);
 208         as_purge(as);
 209         if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
 210                 err = as_map(as, base, len, segvn_create, zfod_argsp);
 211                 as_rangeunlock(as);
 212                 if (err) {
 213                         res = FC_MAKE_ERR(err);
 214                         goto out;
 215                 }
 216         } else {
 217                 /*
 218                  * This page is already mapped by another thread after we
 219                  * returned from as_fault() above.  We just fallthrough
 220                  * as_fault() below.
 221                  */
 222                 as_rangeunlock(as);
 223         }
 224 
 225         res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
 226 
 227 out:
 228 
 229         return (res);
 230 }
 231 
 232 /*
 233  * This is the routine which defines the address limit implied
 234  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 235  * mappable address in a 32-bit process on this platform (though
 236  * perhaps we should make it be UINT32_MAX here?)
 237  */
 238 void
 239 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 240 {
 241         struct proc *p = curproc;
 242         caddr_t userlimit = flags & _MAP_LOW32 ?
 243             (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
 244         map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
 245 }
 246 
 247 /*
 248  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 249  */
 250 caddr_t hole_start, hole_end;
 251 
 252 /*
 253  * kpm mapping window
 254  */
 255 caddr_t kpm_vbase;
 256 size_t  kpm_size;
 257 uchar_t kpm_size_shift;
 258 
 259 int valid_va_range_aligned_wraparound;
 260 /*
 261  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 262  * addresses at least "minlen" long, where the base of the range is at "off"
 263  * phase from an "align" boundary and there is space for a "redzone"-sized
 264  * redzone on either side of the range.  On success, 1 is returned and *basep
 265  * and *lenp are adjusted to describe the acceptable range (including
 266  * the redzone).  On failure, 0 is returned.
 267  */
 268 int
 269 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 270     size_t align, size_t redzone, size_t off)
 271 {
 272         caddr_t hi, lo;
 273         size_t tot_len;
 274 
 275         ASSERT(align == 0 ? off == 0 : off < align);
 276         ASSERT(ISP2(align));
 277         ASSERT(align == 0 || align >= PAGESIZE);
 278 
 279         lo = *basep;
 280         hi = lo + *lenp;
 281         tot_len = minlen + 2 * redzone; /* need at least this much space */
 282 
 283         /* If hi rolled over the top try cutting back. */
 284         if (hi < lo) {
 285                 *lenp = 0UL - (uintptr_t)lo - 1UL;
 286                 /* Trying to see if this really happens, and then if so, why */
 287                 valid_va_range_aligned_wraparound++;
 288                 hi = lo + *lenp;
 289         }
 290         if (*lenp < tot_len) {
 291                 return (0);
 292         }
 293 
 294         /*
 295          * Deal with a possible hole in the address range between
 296          * hole_start and hole_end that should never be mapped by the MMU.
 297          */
 298 
 299         if (lo < hole_start) {
 300                 if (hi > hole_start)
 301                         if (hi < hole_end)
 302                                 hi = hole_start;
 303                         else
 304                                 /* lo < hole_start && hi >= hole_end */
 305                                 if (dir == AH_LO) {
 306                                         /*
 307                                          * prefer lowest range
 308                                          */
 309                                         if (hole_start - lo >= tot_len)
 310                                                 hi = hole_start;
 311                                         else if (hi - hole_end >= tot_len)
 312                                                 lo = hole_end;
 313                                         else
 314                                                 return (0);
 315                                 } else {
 316                                         /*
 317                                          * prefer highest range
 318                                          */
 319                                         if (hi - hole_end >= tot_len)
 320                                                 lo = hole_end;
 321                                         else if (hole_start - lo >= tot_len)
 322                                                 hi = hole_start;
 323                                         else
 324                                                 return (0);
 325                                 }
 326         } else {
 327                 /* lo >= hole_start */
 328                 if (hi < hole_end)
 329                         return (0);
 330                 if (lo < hole_end)
 331                         lo = hole_end;
 332         }
 333 
 334         /* Check if remaining length is too small */
 335         if (hi - lo < tot_len) {
 336                 return (0);
 337         }
 338         if (align > 1) {
 339                 caddr_t tlo = lo + redzone;
 340                 caddr_t thi = hi - redzone;
 341                 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
 342                 if (tlo < lo + redzone) {
 343                         return (0);
 344                 }
 345                 if (thi < tlo || thi - tlo < minlen) {
 346                         return (0);
 347                 }
 348         }
 349         *basep = lo;
 350         *lenp = hi - lo;
 351         return (1);
 352 }
 353 
 354 /*
 355  * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 356  * addresses at least "minlen" long.  On success, 1 is returned and *basep
 357  * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 358  * is returned.
 359  */
 360 int
 361 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
 362 {
 363         return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
 364 }
 365 
 366 /*
 367  * Determine whether [addr, addr+len] with protections `prot' are valid
 368  * for a user address space.
 369  */
 370 /*ARGSUSED*/
 371 int
 372 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
 373     caddr_t userlimit)
 374 {
 375         caddr_t eaddr = addr + len;
 376 
 377         if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
 378                 return (RANGE_BADADDR);
 379 
 380         /*
 381          * Determine if the address range falls within an illegal
 382          * range of the MMU.
 383          */
 384         if (eaddr > hole_start && addr < hole_end)
 385                 return (RANGE_BADADDR);
 386 
 387 #if defined(SF_ERRATA_57)
 388         /*
 389          * Make sure USERLIMIT isn't raised too high
 390          */
 391         ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
 392             errata57_limit == 0);
 393 
 394         if (AS_TYPE_64BIT(as) &&
 395             (addr < errata57_limit) &&
 396             (prot & PROT_EXEC))
 397                 return (RANGE_BADPROT);
 398 #endif /* SF_ERRATA57 */
 399         return (RANGE_OKAY);
 400 }
 401 
 402 /*
 403  * Routine used to check to see if an a.out can be executed
 404  * by the current machine/architecture.
 405  */
 406 int
 407 chkaout(struct exdata *exp)
 408 {
 409         if (exp->ux_mach == M_SPARC)
 410                 return (0);
 411         else
 412                 return (ENOEXEC);
 413 }
 414 
 415 /*
 416  * The following functions return information about an a.out
 417  * which is used when a program is executed.
 418  */
 419 
 420 /*
 421  * Return the load memory address for the data segment.
 422  */
 423 caddr_t
 424 getdmem(struct exec *exp)
 425 {
 426         /*
 427          * XXX - Sparc Reference Hack approaching
 428          * Remember that we are loading
 429          * 8k executables into a 4k machine
 430          * DATA_ALIGN == 2 * PAGESIZE
 431          */
 432         if (exp->a_text)
 433                 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
 434         else
 435                 return ((caddr_t)USRTEXT);
 436 }
 437 
 438 /*
 439  * Return the starting disk address for the data segment.
 440  */
 441 ulong_t
 442 getdfile(struct exec *exp)
 443 {
 444         if (exp->a_magic == ZMAGIC)
 445                 return (exp->a_text);
 446         else
 447                 return (sizeof (struct exec) + exp->a_text);
 448 }
 449 
 450 /*
 451  * Return the load memory address for the text segment.
 452  */
 453 
 454 /*ARGSUSED*/
 455 caddr_t
 456 gettmem(struct exec *exp)
 457 {
 458         return ((caddr_t)USRTEXT);
 459 }
 460 
 461 /*
 462  * Return the file byte offset for the text segment.
 463  */
 464 uint_t
 465 gettfile(struct exec *exp)
 466 {
 467         if (exp->a_magic == ZMAGIC)
 468                 return (0);
 469         else
 470                 return (sizeof (struct exec));
 471 }
 472 
 473 void
 474 getexinfo(
 475         struct exdata *edp_in,
 476         struct exdata *edp_out,
 477         int *pagetext,
 478         int *pagedata)
 479 {
 480         *edp_out = *edp_in;     /* structure copy */
 481 
 482         if ((edp_in->ux_mag == ZMAGIC) &&
 483             ((edp_in->vp->v_flag & VNOMAP) == 0)) {
 484                 *pagetext = 1;
 485                 *pagedata = 1;
 486         } else {
 487                 *pagetext = 0;
 488                 *pagedata = 0;
 489         }
 490 }
 491 
 492 /*
 493  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
 494  * KPM selects an address such that it's equal offset modulo shm_alignment and
 495  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
 496  */
 497 int
 498 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 499 {
 500         if (vac) {
 501                 return (((uintptr_t)addr ^ off) & shm_alignment - 1);
 502         } else {
 503                 return (0);
 504         }
 505 }
 506 
 507 /*
 508  * Sanity control. Don't use large pages regardless of user
 509  * settings if there's less than priv or shm_lpg_min_physmem memory installed.
 510  * The units for this variable is 8K pages.
 511  */
 512 pgcnt_t shm_lpg_min_physmem = 131072;                   /* 1GB */
 513 pgcnt_t privm_lpg_min_physmem = 131072;                 /* 1GB */
 514 
 515 static size_t
 516 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
 517 {
 518         size_t          pgsz = MMU_PAGESIZE;
 519         int             szc;
 520 
 521         /*
 522          * If len is zero, retrieve from proc and don't demote the page size.
 523          * Use atleast the default pagesize.
 524          */
 525         if (len == 0) {
 526                 len = p->p_brkbase + p->p_brksize - p->p_bssbase;
 527         }
 528         len = MAX(len, default_uheap_lpsize);
 529 
 530         for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 531                 pgsz = hw_page_array[szc].hp_size;
 532                 if ((disable_auto_data_large_pages & (1 << szc)) ||
 533                     pgsz > max_uheap_lpsize)
 534                         continue;
 535                 if (len >= pgsz) {
 536                         break;
 537                 }
 538         }
 539 
 540         /*
 541          * If addr == 0 we were called by memcntl() when the
 542          * size code is 0.  Don't set pgsz less than current size.
 543          */
 544         if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
 545                 pgsz = hw_page_array[p->p_brkpageszc].hp_size;
 546         }
 547 
 548         return (pgsz);
 549 }
 550 
 551 static size_t
 552 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
 553 {
 554         size_t          pgsz = MMU_PAGESIZE;
 555         int             szc;
 556 
 557         /*
 558          * If len is zero, retrieve from proc and don't demote the page size.
 559          * Use atleast the default pagesize.
 560          */
 561         if (len == 0) {
 562                 len = p->p_stksize;
 563         }
 564         len = MAX(len, default_ustack_lpsize);
 565 
 566         for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 567                 pgsz = hw_page_array[szc].hp_size;
 568                 if ((disable_auto_data_large_pages & (1 << szc)) ||
 569                     pgsz > max_ustack_lpsize)
 570                         continue;
 571                 if (len >= pgsz) {
 572                         break;
 573                 }
 574         }
 575 
 576         /*
 577          * If addr == 0 we were called by memcntl() or exec_args() when the
 578          * size code is 0.  Don't set pgsz less than current size.
 579          */
 580         if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
 581                 pgsz = hw_page_array[p->p_stkpageszc].hp_size;
 582         }
 583 
 584         return (pgsz);
 585 }
 586 
 587 static size_t
 588 map_pgszism(caddr_t addr, size_t len)
 589 {
 590         uint_t szc;
 591         size_t pgsz;
 592 
 593         for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
 594                 if (disable_ism_large_pages & (1 << szc))
 595                         continue;
 596 
 597                 pgsz = hw_page_array[szc].hp_size;
 598                 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
 599                         return (pgsz);
 600         }
 601 
 602         return (DEFAULT_ISM_PAGESIZE);
 603 }
 604 
 605 /*
 606  * Suggest a page size to be used to map a segment of type maptype and length
 607  * len.  Returns a page size (not a size code).
 608  */
 609 /* ARGSUSED */
 610 size_t
 611 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 612 {
 613         size_t  pgsz = MMU_PAGESIZE;
 614 
 615         ASSERT(maptype != MAPPGSZ_VA);
 616 
 617         if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 618                 return (MMU_PAGESIZE);
 619         }
 620 
 621         switch (maptype) {
 622         case MAPPGSZ_ISM:
 623                 pgsz = map_pgszism(addr, len);
 624                 break;
 625 
 626         case MAPPGSZ_STK:
 627                 if (max_ustack_lpsize > MMU_PAGESIZE) {
 628                         pgsz = map_pgszstk(p, addr, len);
 629                 }
 630                 break;
 631 
 632         case MAPPGSZ_HEAP:
 633                 if (max_uheap_lpsize > MMU_PAGESIZE) {
 634                         pgsz = map_pgszheap(p, addr, len);
 635                 }
 636                 break;
 637         }
 638         return (pgsz);
 639 }
 640 
 641 
/* assumes TTE8K...TTE4M == szc */

/*
 * Build a bit vector of size codes usable to map [addr, addr + size)
 * given an offset phase, a disabled-sizes mask, a maximum large page
 * size, and a minimum installed-memory requirement.  Bit i is set when
 * page size code i can be used.  Returns 0 when large pages are not
 * applicable.  The 8K bit (bit 0) is never returned.
 */
static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    size_t max_lpsize, size_t min_physmem)
{
        caddr_t eaddr = addr + size;
        uint_t szcvec = 0;
        caddr_t raddr;
        caddr_t readdr;
        size_t pgsz;
        int i;

        /* no large pages on small-memory machines or tiny max sizes */
        if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
                return (0);
        }
        for (i = mmu_page_sizes - 1; i > 0; i--) {
                if (disable_lpgs & (1 << i)) {
                        continue;
                }
                pgsz = page_get_pagesize(i);
                if (pgsz > max_lpsize) {
                        continue;
                }
                /* range must span at least one fully-aligned pgsz page */
                raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
                readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
                if (raddr < addr || raddr >= readdr) {
                        continue;
                }
                /* addr and off must agree modulo pgsz (VAC/offset phase) */
                if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
                        continue;
                }
                szcvec |= (1 << i);
                /*
                 * And or in the remaining enabled page sizes.
                 */
                szcvec |= P2PHASE(~disable_lpgs, (1 << i));
                szcvec &= ~1; /* no need to return 8K pagesize */
                break;
        }
        return (szcvec);
}
 684 
 685 /*
 686  * Return a bit vector of large page size codes that
 687  * can be used to map [addr, addr + len) region.
 688  */
 689 /* ARGSUSED */
 690 uint_t
 691 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 692     int memcntl)
 693 {
 694         if (flags & MAP_TEXT) {
 695                 return (map_szcvec(addr, size, off,
 696                     disable_auto_text_large_pages,
 697                     max_utext_lpsize, shm_lpg_min_physmem));
 698 
 699         } else if (flags & MAP_INITDATA) {
 700                 return (map_szcvec(addr, size, off,
 701                     disable_auto_data_large_pages,
 702                     max_uidata_lpsize, privm_lpg_min_physmem));
 703 
 704         } else if (type == MAPPGSZC_SHM) {
 705                 return (map_szcvec(addr, size, off,
 706                     disable_auto_data_large_pages,
 707                     max_shm_lpsize, shm_lpg_min_physmem));
 708 
 709         } else if (type == MAPPGSZC_HEAP) {
 710                 return (map_szcvec(addr, size, off,
 711                     disable_auto_data_large_pages,
 712                     max_uheap_lpsize, privm_lpg_min_physmem));
 713 
 714         } else if (type == MAPPGSZC_STACK) {
 715                 return (map_szcvec(addr, size, off,
 716                     disable_auto_data_large_pages,
 717                     max_ustack_lpsize, privm_lpg_min_physmem));
 718 
 719         } else {
 720                 return (map_szcvec(addr, size, off,
 721                     disable_auto_data_large_pages,
 722                     max_privmap_lpsize, privm_lpg_min_physmem));
 723         }
 724 }
 725 
 726 /*
 727  * Anchored in the table below are counters used to keep track
 728  * of free contiguous physical memory. Each element of the table contains
 729  * the array of counters, the size of array which is allocated during
 730  * startup based on physmax and a shift value used to convert a pagenum
 731  * into a counter array index or vice versa. The table has page size
 732  * for rows and region size for columns:
 733  *
 734  *      page_counters[page_size][region_size]
 735  *
 736  *      page_size:      TTE size code of pages on page_size freelist.
 737  *
 *      region_size:    TTE size code of a candidate larger page made up
 *                      of contiguous free page_size pages.
 740  *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page. Yuck! Let's try an example:
 *
 *      page_counters[1][3] is the table element used for identifying
 *      candidate 4M pages from contiguous pages off the 64K free list.
 *      Each index in the page_counters[1][3].array spans 4M. It's the
 *      number of free 512K size (region_size - 1) groups of contiguous
 *      64K free pages. So when page_counters[1][3].counters[n] == 8
 *      we know we have a candidate 4M page made up of 512K size groups
 *      of 64K free pages.
 753  */
 754 
 755 /*
 756  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 757  * dimensions are allocated dynamically.
 758  */
 759 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
 760 
 761 /*
 762  * For now there is only a single size cache list.
 763  * Allocated dynamically.
 764  */
 765 page_t ***page_cachelists[MAX_MEM_TYPES];
 766 
 767 kmutex_t *fpc_mutex[NPC_MUTEX];
 768 kmutex_t *cpc_mutex[NPC_MUTEX];
 769 
 770 /*
 771  * Calculate space needed for page freelists and counters
 772  */
 773 size_t
 774 calc_free_pagelist_sz(void)
 775 {
 776         int szc;
 777         size_t alloc_sz, cache_sz, free_sz;
 778 
 779         /*
 780          * one cachelist per color, node, and type
 781          */
 782         cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
 783             sizeof (page_t **);
 784         cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
 785 
 786         /*
 787          * one freelist per size, color, node, and type
 788          */
 789         free_sz = sizeof (page_t **);
 790         for (szc = 0; szc < mmu_page_sizes; szc++)
 791                 free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
 792         free_sz *= max_mem_nodes * MAX_MEM_TYPES;
 793 
 794         alloc_sz = cache_sz + free_sz + page_ctrs_sz();
 795         return (alloc_sz);
 796 }
 797 
/*
 * Carve the page cachelist and freelist arrays out of a pre-sized
 * allocation starting at alloc_base (sized by calc_free_pagelist_sz()),
 * then allocate the page counters after them.  Returns the first unused
 * address past everything allocated here.
 */
caddr_t
alloc_page_freelists(caddr_t alloc_base)
{
        int     mnode, mtype;
        int     szc, clrs;

        /*
         * We only support small pages in the cachelist.
         */
        for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                page_cachelists[mtype] = (page_t ***)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (page_t **));
                for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                        page_cachelists[mtype][mnode] = (page_t **)alloc_base;
                        alloc_base +=
                            (page_get_pagecolors(0) * sizeof (page_t *));
                }
        }

        /*
         * Allocate freelists bins for all
         * supported page sizes.
         */
        for (szc = 0; szc < mmu_page_sizes; szc++) {
                clrs = page_get_pagecolors(szc);
                for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                        page_freelists[szc][mtype] = (page_t ***)alloc_base;
                        alloc_base += (max_mem_nodes * sizeof (page_t **));
                        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                                page_freelists[szc][mtype][mnode] =
                                    (page_t **)alloc_base;
                                alloc_base += (clrs * (sizeof (page_t *)));
                        }
                }
        }

        /* page counters live directly after the freelist arrays */
        alloc_base = page_ctrs_alloc(alloc_base);
        return (alloc_base);
}
 837 
 838 /*
 839  * Allocate page_freelists locks for a memnode from the nucleus data
 840  * area. This is the first time that mmu_page_sizes is used during
 841  * bootup, so check mmu_page_sizes initialization.
 842  */
 843 int
 844 ndata_alloc_page_mutexs(struct memlist *ndata)
 845 {
 846         size_t alloc_sz;
 847         caddr_t alloc_base;
 848         int     i;
 849         void    page_coloring_init();
 850 
 851         page_coloring_init();
 852         if (&mmu_init_mmu_page_sizes) {
 853                 if (!mmu_init_mmu_page_sizes(0)) {
 854                         cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
 855                             mmu_page_sizes);
 856                 }
 857         }
 858         ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
 859 
 860         /* fpc_mutex and cpc_mutex */
 861         alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
 862 
 863         alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
 864         if (alloc_base == NULL)
 865                 return (-1);
 866 
 867         ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
 868 
 869         for (i = 0; i < NPC_MUTEX; i++) {
 870                 fpc_mutex[i] = (kmutex_t *)alloc_base;
 871                 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 872                 cpc_mutex[i] = (kmutex_t *)alloc_base;
 873                 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 874         }
 875         return (0);
 876 }
 877 
 878 /*
 879  * To select our starting bin, we stride through the bins with a stride
 880  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 881  * in simulation and practice for different workloads on varying cache sizes.
 882  */
 883 uint32_t color_start_current = 0;
 884 uint32_t color_start_stride = 337;
 885 int color_start_random = 0;
 886 
/*
 * Return the starting page color bin for a new address space.  Either
 * derived from the tick counter (random mode / consistent_coloring == 2)
 * or by atomically advancing a shared stride-based cursor.
 */
/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
        uint32_t old, new;

        if (consistent_coloring == 2 || color_start_random) {
                return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
                    (hw_page_array[0].hp_colors - 1)));
        }

        /* lock-free update: retry the CAS until no other cpu races us */
        do {
                old = color_start_current;
                new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
        } while (atomic_cas_32(&color_start_current, old, new) != old);

        return ((uint_t)(new));
}
 905 
 906 /*
 907  * Called once at startup from kphysm_init() -- before memialloc()
 908  * is invoked to do the 1st page_free()/page_freelist_add().
 909  *
 910  * initializes page_colors and page_colors_mask based on ecache_setsize.
 911  *
 912  * Also initializes the counter locks.
 913  */
 914 void
 915 page_coloring_init()
 916 {
 917         int     a, i;
 918         uint_t colors;
 919 
 920         if (do_pg_coloring == 0) {
 921                 page_colors = 1;
 922                 for (i = 0; i < mmu_page_sizes; i++) {
 923                         colorequivszc[i] = 0;
 924                         hw_page_array[i].hp_colors = 1;
 925                 }
 926                 return;
 927         }
 928 
 929         /*
 930          * Calculate page_colors from ecache_setsize. ecache_setsize contains
 931          * the max ecache setsize of all cpus configured in the system or, for
 932          * cheetah+ systems, the max possible ecache setsize for all possible
 933          * cheetah+ cpus.
 934          */
 935         page_colors = ecache_setsize / MMU_PAGESIZE;
 936         page_colors_mask = page_colors - 1;
 937 
 938         vac_colors = vac_size / MMU_PAGESIZE;
 939         vac_colors_mask = vac_colors -1;
 940 
 941         page_coloring_shift = 0;
 942         a = ecache_setsize;
 943         while (a >>= 1) {
 944                 page_coloring_shift++;
 945         }
 946 
 947         /* initialize number of colors per page size */
 948         for (i = 0; i < mmu_page_sizes; i++) {
 949                 hw_page_array[i].hp_colors = (page_colors_mask >>
 950                     (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
 951                     + 1;
 952                 colorequivszc[i] = 0;
 953         }
 954 
 955         /*
 956          * initialize cpu_page_colors if ecache setsizes are homogenous.
 957          * cpu_page_colors set to -1 during DR operation or during startup
 958          * if setsizes are heterogenous.
 959          *
 960          * The value of cpu_page_colors determines if additional color bins
 961          * need to be checked for a particular color in the page_get routines.
 962          */
 963         if (cpu_setsize > 0 && cpu_page_colors == 0 &&
 964             cpu_setsize < ecache_setsize) {
 965                 cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
 966                 a = lowbit(page_colors) - lowbit(cpu_page_colors);
 967                 ASSERT(a > 0);
 968                 ASSERT(a < 16);
 969 
 970                 for (i = 0; i < mmu_page_sizes; i++) {
 971                         if ((colors = hw_page_array[i].hp_colors) <= 1) {
 972                                 continue;
 973                         }
 974                         while ((colors >> a) == 0)
 975                                 a--;
 976                         ASSERT(a >= 0);
 977 
 978                         /* higher 4 bits encodes color equiv mask */
 979                         colorequivszc[i] = (a << 4);
 980                 }
 981         }
 982 
 983         /* do cpu specific color initialization */
 984         if (&page_coloring_init_cpu) {
 985                 page_coloring_init_cpu();
 986         }
 987 }
 988 
 989 int
 990 bp_color(struct buf *bp)
 991 {
 992         int color = -1;
 993 
 994         if (vac) {
 995                 if ((bp->b_flags & B_PAGEIO) != 0) {
 996                         color = sfmmu_get_ppvcolor(bp->b_pages);
 997                 } else if (bp->b_un.b_addr != NULL) {
 998                         color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
 999                 }
1000         }
1001         return (color < 0 ? 0 : ptob(color));
1002 }
1003 
1004 /*
1005  * Function for flushing D-cache when performing module relocations
1006  * to an alternate mapping.  Stubbed out on all platforms except sun4u,
1007  * at least for now.
1008  */
1009 void
1010 dcache_flushall()
1011 {
1012         sfmmu_cache_flushall();
1013 }
1014 
/*
 * Return 1 if the address ranges [va1, va1 + sz1) and [va2, va2 + sz2)
 * overlap, 0 if one lies entirely before the other.  Ranges that share
 * the same base are always considered overlapping (even if zero-sized).
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
        int r1_precedes_r2 = (va1 < va2 && va1 + sz1 <= va2);
        int r2_precedes_r1 = (va2 < va1 && va2 + sz2 <= va1);

        return (!(r1_precedes_r2 || r2_precedes_r1));
}
1026 
1027 /*
1028  * Return the number of bytes, relative to the beginning of a given range, that
1029  * are non-toxic (can be read from and written to with relative impunity).
1030  */
1031 size_t
1032 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1033 {
1034         /* OBP reads are harmless, but we don't want people writing there */
1035         if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1036             OFW_START_ADDR + 1))
1037                 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1038 
1039         if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1040                 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1041 
1042         return (sz); /* no overlap */
1043 }
1044 
1045 /*
1046  * Minimum physmem required for enabling large pages for kernel heap
1047  * Currently we do not enable lp for kmem on systems with less
1048  * than 1GB of memory. This value can be changed via /etc/system
1049  */
1050 size_t segkmem_lpminphysmem = 0x40000000;       /* 1GB */
1051 
1052 /*
1053  * this function chooses large page size for kernel heap
1054  */
1055 size_t
1056 get_segkmem_lpsize(size_t lpsize)
1057 {
1058         size_t memtotal = physmem * PAGESIZE;
1059         size_t mmusz;
1060         uint_t szc;
1061 
1062         if (memtotal < segkmem_lpminphysmem)
1063                 return (PAGESIZE);
1064 
1065         if (plat_lpkmem_is_supported != NULL &&
1066             plat_lpkmem_is_supported() == 0)
1067                 return (PAGESIZE);
1068 
1069         mmusz = mmu_get_kernel_lpsize(lpsize);
1070         szc = page_szc(mmusz);
1071 
1072         while (szc) {
1073                 if (!(disable_large_pages & (1 << szc)))
1074                         return (page_get_pagesize(szc));
1075                 szc--;
1076         }
1077         return (PAGESIZE);
1078 }