il_7029 Wdiff usr/src/uts/sun4/vm/vm_dep.c

Print this page

7029 want per-process exploit mitigation features (secflags)
7030 want basic address space layout randomization (aslr)
7031 noexec_user_stack should be a secflag
7032 want a means to forbid mappings around NULL.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/sun4/vm/vm_dep.c
          +++ new/usr/src/uts/sun4/vm/vm_dep.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * UNIX machine dependent virtual memory support.
  28   28   */
  29   29

↓ open down ↓

29 lines elided

↑ open up ↑

  30   30  #include <sys/vm.h>
  31   31  #include <sys/exec.h>
  32   32  
  33   33  #include <sys/exechdr.h>
  34   34  #include <vm/seg_kmem.h>
  35   35  #include <sys/atomic.h>
  36   36  #include <sys/archsystm.h>
  37   37  #include <sys/machsystm.h>
  38   38  #include <sys/kdi.h>
  39   39  #include <sys/cpu_module.h>
       40 +#include <sys/secflags.h>
  40   41  
  41   42  #include <vm/hat_sfmmu.h>
  42   43  
  43   44  #include <sys/memnode.h>
  44   45  
  45   46  #include <sys/mem_config.h>
  46   47  #include <sys/mem_cage.h>
  47   48  #include <vm/vm_dep.h>
  48   49  #include <vm/page.h>
  49   50  #include <sys/platform_module.h>

  50   51  
  51   52  /*
  52   53   * These variables are set by module specific config routines.
  53   54   * They are only set by modules which will use physical cache page coloring.
  54   55   */
  55   56  int do_pg_coloring = 0;
  56   57  
  57   58  /*
  58   59   * These variables can be conveniently patched at kernel load time to
  59   60   * prevent do_pg_coloring from being enabled by
  60   61   * module specific config routines.
  61   62   */
  62   63  
  63   64  int use_page_coloring = 1;
  64   65  
  65   66  /*
  66   67   * initialized by page_coloring_init()
  67   68   */
  68   69  extern uint_t page_colors;
  69   70  extern uint_t page_colors_mask;
  70   71  extern uint_t page_coloring_shift;
  71   72  int cpu_page_colors;
  72   73  uint_t vac_colors = 0;
  73   74  uint_t vac_colors_mask = 0;
  74   75  
  75   76  /* cpu specific coloring initialization */
  76   77  extern void page_coloring_init_cpu();
  77   78  #pragma weak page_coloring_init_cpu
  78   79  
  79   80  /*
  80   81   * get the ecache setsize for the current cpu.
  81   82   */
  82   83  #define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)
  83   84  
  84   85  plcnt_t         plcnt;          /* page list count */
  85   86  
  86   87  /*
  87   88   * This variable is set by the cpu module to contain the lowest
  88   89   * address not affected by the SF_ERRATA_57 workaround.  It should
  89   90   * remain 0 if the workaround is not needed.
  90   91   */
  91   92  #if defined(SF_ERRATA_57)
  92   93  caddr_t errata57_limit;
  93   94  #endif
  94   95  
  95   96  extern void page_relocate_hash(page_t *, page_t *);
  96   97  
  97   98  /*
  98   99   * these must be defined in platform specific areas
  99  100   */
 100  101  extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
 101  102          struct proc *, uint_t);
 102  103  extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
 103  104          caddr_t, size_t, uint_t, struct lgrp *);
 104  105  /*
 105  106   * Convert page frame number to an OBMEM page frame number
 106  107   * (i.e. put in the type bits -- zero for this implementation)
 107  108   */
 108  109  pfn_t
 109  110  impl_obmem_pfnum(pfn_t pf)
 110  111  {
 111  112          return (pf);
 112  113  }
 113  114  
 114  115  /*
 115  116   * Use physmax to determine the highest physical page of DRAM memory
 116  117   * It is assumed that any physical addresses above physmax is in IO space.
 117  118   * We don't bother checking the low end because we assume that memory space
 118  119   * begins at physical page frame 0.
 119  120   *
 120  121   * Return 1 if the page frame is onboard DRAM memory, else 0.
 121  122   * Returns 0 for nvram so it won't be cached.
 122  123   */
 123  124  int
 124  125  pf_is_memory(pfn_t pf)
 125  126  {
 126  127          /* We must be IO space */
 127  128          if (pf > physmax)
 128  129                  return (0);
 129  130  
 130  131          /* We must be memory space */
 131  132          return (1);
 132  133  }
 133  134  
 134  135  /*
 135  136   * Handle a pagefault.
 136  137   */
 137  138  faultcode_t
 138  139  pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
 139  140  {
 140  141          struct as *as;
 141  142          struct proc *p;
 142  143          faultcode_t res;
 143  144          caddr_t base;
 144  145          size_t len;
 145  146          int err;
 146  147  
 147  148          if (INVALID_VADDR(addr))
 148  149                  return (FC_NOMAP);
 149  150  
 150  151          if (iskernel) {
 151  152                  as = &kas;
 152  153          } else {
 153  154                  p = curproc;
 154  155                  as = p->p_as;
 155  156  #if defined(SF_ERRATA_57)
 156  157                  /*
 157  158                   * Prevent infinite loops due to a segment driver
 158  159                   * setting the execute permissions and the sfmmu hat
 159  160                   * silently ignoring them.
 160  161                   */
 161  162                  if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
 162  163                      addr < errata57_limit) {
 163  164                          res = FC_NOMAP;
 164  165                          goto out;
 165  166                  }
 166  167  #endif
 167  168          }
 168  169  
 169  170          /*
 170  171           * Dispatch pagefault.
 171  172           */
 172  173          res = as_fault(as->a_hat, as, addr, 1, type, rw);
 173  174  
 174  175          /*
 175  176           * If this isn't a potential unmapped hole in the user's
 176  177           * UNIX data or stack segments, just return status info.
 177  178           */
 178  179          if (!(res == FC_NOMAP && iskernel == 0))
 179  180                  goto out;
 180  181  
 181  182          /*
 182  183           * Check to see if we happened to faulted on a currently unmapped
 183  184           * part of the UNIX data or stack segments.  If so, create a zfod
 184  185           * mapping there and then try calling the fault routine again.
 185  186           */
 186  187          base = p->p_brkbase;
 187  188          len = p->p_brksize;
 188  189  
 189  190          if (addr < base || addr >= base + len) {                /* data seg? */
 190  191                  base = (caddr_t)(p->p_usrstack - p->p_stksize);
 191  192                  len = p->p_stksize;
 192  193                  if (addr < base || addr >= p->p_usrstack) {     /* stack seg? */
 193  194                          /* not in either UNIX data or stack segments */
 194  195                          res = FC_NOMAP;
 195  196                          goto out;
 196  197                  }
 197  198          }
 198  199  
 199  200          /* the rest of this function implements a 3.X 4.X 5.X compatibility */
 200  201          /* This code is probably not needed anymore */
 201  202  
 202  203          /* expand the gap to the page boundaries on each side */
 203  204          len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
 204  205              ((uintptr_t)base & PAGEMASK);
 205  206          base = (caddr_t)((uintptr_t)base & PAGEMASK);
 206  207  
 207  208          as_rangelock(as);
 208  209          as_purge(as);
 209  210          if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
 210  211                  err = as_map(as, base, len, segvn_create, zfod_argsp);
 211  212                  as_rangeunlock(as);
 212  213                  if (err) {
 213  214                          res = FC_MAKE_ERR(err);
 214  215                          goto out;
 215  216                  }
 216  217          } else {
 217  218                  /*
 218  219                   * This page is already mapped by another thread after we
 219  220                   * returned from as_fault() above.  We just fallthrough
 220  221                   * as_fault() below.
 221  222                   */
 222  223                  as_rangeunlock(as);
 223  224          }
 224  225  
 225  226          res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
 226  227  
 227  228  out:
 228  229  
 229  230          return (res);
 230  231  }
 231  232  
 232  233  /*
 233  234   * This is the routine which defines the address limit implied
 234  235   * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 235  236   * mappable address in a 32-bit process on this platform (though
 236  237   * perhaps we should make it be UINT32_MAX here?)
 237  238   */
 238  239  void
 239  240  map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 240  241  {
 241  242          struct proc *p = curproc;
 242  243          caddr_t userlimit = flags & _MAP_LOW32 ?
 243  244              (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
 244  245          map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
 245  246  }
 246  247  
 247  248  /*
 248  249   * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 249  250   */
 250  251  caddr_t hole_start, hole_end;
 251  252  
 252  253  /*
 253  254   * kpm mapping window
 254  255   */
 255  256  caddr_t kpm_vbase;
 256  257  size_t  kpm_size;
 257  258  uchar_t kpm_size_shift;
 258  259  
 259  260  int valid_va_range_aligned_wraparound;
 260  261  /*
 261  262   * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 262  263   * addresses at least "minlen" long, where the base of the range is at "off"
 263  264   * phase from an "align" boundary and there is space for a "redzone"-sized
 264  265   * redzone on either side of the range.  On success, 1 is returned and *basep
 265  266   * and *lenp are adjusted to describe the acceptable range (including
 266  267   * the redzone).  On failure, 0 is returned.
 267  268   */
 268  269  int
 269  270  valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
 270  271      size_t align, size_t redzone, size_t off)
 271  272  {
 272  273          caddr_t hi, lo;
 273  274          size_t tot_len;
 274  275  
 275  276          ASSERT(align == 0 ? off == 0 : off < align);
 276  277          ASSERT(ISP2(align));
 277  278          ASSERT(align == 0 || align >= PAGESIZE);
 278  279  
 279  280          lo = *basep;
 280  281          hi = lo + *lenp;
 281  282          tot_len = minlen + 2 * redzone; /* need at least this much space */
 282  283  
 283  284          /* If hi rolled over the top try cutting back. */
 284  285          if (hi < lo) {
 285  286                  *lenp = 0UL - (uintptr_t)lo - 1UL;
 286  287                  /* Trying to see if this really happens, and then if so, why */
 287  288                  valid_va_range_aligned_wraparound++;
 288  289                  hi = lo + *lenp;
 289  290          }
 290  291          if (*lenp < tot_len) {
 291  292                  return (0);
 292  293          }
 293  294  
 294  295          /*
 295  296           * Deal with a possible hole in the address range between
 296  297           * hole_start and hole_end that should never be mapped by the MMU.
 297  298           */
 298  299  
 299  300          if (lo < hole_start) {
 300  301                  if (hi > hole_start)
 301  302                          if (hi < hole_end)
 302  303                                  hi = hole_start;
 303  304                          else
 304  305                                  /* lo < hole_start && hi >= hole_end */
 305  306                                  if (dir == AH_LO) {
 306  307                                          /*
 307  308                                           * prefer lowest range
 308  309                                           */
 309  310                                          if (hole_start - lo >= tot_len)
 310  311                                                  hi = hole_start;
 311  312                                          else if (hi - hole_end >= tot_len)
 312  313                                                  lo = hole_end;
 313  314                                          else
 314  315                                                  return (0);
 315  316                                  } else {
 316  317                                          /*
 317  318                                           * prefer highest range
 318  319                                           */
 319  320                                          if (hi - hole_end >= tot_len)
 320  321                                                  lo = hole_end;
 321  322                                          else if (hole_start - lo >= tot_len)
 322  323                                                  hi = hole_start;
 323  324                                          else
 324  325                                                  return (0);
 325  326                                  }
 326  327          } else {
 327  328                  /* lo >= hole_start */
 328  329                  if (hi < hole_end)
 329  330                          return (0);
 330  331                  if (lo < hole_end)
 331  332                          lo = hole_end;
 332  333          }
 333  334  
 334  335          /* Check if remaining length is too small */
 335  336          if (hi - lo < tot_len) {
 336  337                  return (0);
 337  338          }
 338  339          if (align > 1) {
 339  340                  caddr_t tlo = lo + redzone;
 340  341                  caddr_t thi = hi - redzone;
 341  342                  tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
 342  343                  if (tlo < lo + redzone) {
 343  344                          return (0);
 344  345                  }
 345  346                  if (thi < tlo || thi - tlo < minlen) {
 346  347                          return (0);
 347  348                  }
 348  349          }
 349  350          *basep = lo;
 350  351          *lenp = hi - lo;
 351  352          return (1);
 352  353  }
 353  354  
 354  355  /*
 355  356   * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 356  357   * addresses at least "minlen" long.  On success, 1 is returned and *basep

↓ open down ↓

307 lines elided

↑ open up ↑

 357  358   * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 358  359   * is returned.
 359  360   */
 360  361  int
 361  362  valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
 362  363  {
 363  364          return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
 364  365  }
 365  366  
 366  367  /*
      368 + * Default to forbidding the first 64k of address space.  This protects most
      369 + * reasonably sized structures from dereferences through NULL:
      370 + *     ((foo_t *)0)->bar
      371 + */
      372 +uintptr_t forbidden_null_mapping_sz = 0x10000;
      373 +
      374 +/*
 367  375   * Determine whether [addr, addr+len] with protections `prot' are valid
 368  376   * for a user address space.
 369  377   */
 370  378  /*ARGSUSED*/
 371  379  int
 372  380  valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
 373  381      caddr_t userlimit)
 374  382  {
 375  383          caddr_t eaddr = addr + len;
 376  384  
 377  385          if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
 378  386                  return (RANGE_BADADDR);
 379  387  
      388 +        if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
      389 +            secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
      390 +                return (RANGE_BADADDR);
      391 +
 380  392          /*
 381  393           * Determine if the address range falls within an illegal
 382  394           * range of the MMU.
 383  395           */
 384  396          if (eaddr > hole_start && addr < hole_end)
 385  397                  return (RANGE_BADADDR);
 386  398  
 387  399  #if defined(SF_ERRATA_57)
 388  400          /*
 389  401           * Make sure USERLIMIT isn't raised too high

 390  402           */
 391  403          ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
 392  404              errata57_limit == 0);
 393  405  
 394  406          if (AS_TYPE_64BIT(as) &&
 395  407              (addr < errata57_limit) &&
 396  408              (prot & PROT_EXEC))
 397  409                  return (RANGE_BADPROT);
 398  410  #endif /* SF_ERRATA57 */
 399  411          return (RANGE_OKAY);
 400  412  }
 401  413  
 402  414  /*
 403  415   * Routine used to check to see if an a.out can be executed
 404  416   * by the current machine/architecture.
 405  417   */
 406  418  int
 407  419  chkaout(struct exdata *exp)
 408  420  {
 409  421          if (exp->ux_mach == M_SPARC)
 410  422                  return (0);
 411  423          else
 412  424                  return (ENOEXEC);
 413  425  }
 414  426  
 415  427  /*
 416  428   * The following functions return information about an a.out
 417  429   * which is used when a program is executed.
 418  430   */
 419  431  
 420  432  /*
 421  433   * Return the load memory address for the data segment.
 422  434   */
 423  435  caddr_t
 424  436  getdmem(struct exec *exp)
 425  437  {
 426  438          /*
 427  439           * XXX - Sparc Reference Hack approaching
 428  440           * Remember that we are loading
 429  441           * 8k executables into a 4k machine
 430  442           * DATA_ALIGN == 2 * PAGESIZE
 431  443           */
 432  444          if (exp->a_text)
 433  445                  return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
 434  446          else
 435  447                  return ((caddr_t)USRTEXT);
 436  448  }
 437  449  
 438  450  /*
 439  451   * Return the starting disk address for the data segment.
 440  452   */
 441  453  ulong_t
 442  454  getdfile(struct exec *exp)
 443  455  {
 444  456          if (exp->a_magic == ZMAGIC)
 445  457                  return (exp->a_text);
 446  458          else
 447  459                  return (sizeof (struct exec) + exp->a_text);
 448  460  }
 449  461  
 450  462  /*
 451  463   * Return the load memory address for the text segment.
 452  464   */
 453  465  
 454  466  /*ARGSUSED*/
 455  467  caddr_t
 456  468  gettmem(struct exec *exp)
 457  469  {
 458  470          return ((caddr_t)USRTEXT);
 459  471  }
 460  472  
 461  473  /*
 462  474   * Return the file byte offset for the text segment.
 463  475   */
 464  476  uint_t
 465  477  gettfile(struct exec *exp)
 466  478  {
 467  479          if (exp->a_magic == ZMAGIC)
 468  480                  return (0);
 469  481          else
 470  482                  return (sizeof (struct exec));
 471  483  }
 472  484  
 473  485  void
 474  486  getexinfo(
 475  487          struct exdata *edp_in,
 476  488          struct exdata *edp_out,
 477  489          int *pagetext,
 478  490          int *pagedata)
 479  491  {
 480  492          *edp_out = *edp_in;     /* structure copy */
 481  493  
 482  494          if ((edp_in->ux_mag == ZMAGIC) &&
 483  495              ((edp_in->vp->v_flag & VNOMAP) == 0)) {
 484  496                  *pagetext = 1;
 485  497                  *pagedata = 1;
 486  498          } else {
 487  499                  *pagetext = 0;
 488  500                  *pagedata = 0;
 489  501          }
 490  502  }
 491  503  
 492  504  /*
 493  505   * Return non 0 value if the address may cause a VAC alias with KPM mappings.
 494  506   * KPM selects an address such that it's equal offset modulo shm_alignment and
 495  507   * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
 496  508   */
 497  509  int
 498  510  map_addr_vacalign_check(caddr_t addr, u_offset_t off)
 499  511  {
 500  512          if (vac) {
 501  513                  return (((uintptr_t)addr ^ off) & shm_alignment - 1);
 502  514          } else {
 503  515                  return (0);
 504  516          }
 505  517  }
 506  518  
 507  519  /*
 508  520   * Sanity control. Don't use large pages regardless of user
 509  521   * settings if there's less than priv or shm_lpg_min_physmem memory installed.
 510  522   * The units for this variable is 8K pages.
 511  523   */
 512  524  pgcnt_t shm_lpg_min_physmem = 131072;                   /* 1GB */
 513  525  pgcnt_t privm_lpg_min_physmem = 131072;                 /* 1GB */
 514  526  
 515  527  static size_t
 516  528  map_pgszheap(struct proc *p, caddr_t addr, size_t len)
 517  529  {
 518  530          size_t          pgsz = MMU_PAGESIZE;
 519  531          int             szc;
 520  532  
 521  533          /*
 522  534           * If len is zero, retrieve from proc and don't demote the page size.
 523  535           * Use atleast the default pagesize.
 524  536           */
 525  537          if (len == 0) {
 526  538                  len = p->p_brkbase + p->p_brksize - p->p_bssbase;
 527  539          }
 528  540          len = MAX(len, default_uheap_lpsize);
 529  541  
 530  542          for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 531  543                  pgsz = hw_page_array[szc].hp_size;
 532  544                  if ((disable_auto_data_large_pages & (1 << szc)) ||
 533  545                      pgsz > max_uheap_lpsize)
 534  546                          continue;
 535  547                  if (len >= pgsz) {
 536  548                          break;
 537  549                  }
 538  550          }
 539  551  
 540  552          /*
 541  553           * If addr == 0 we were called by memcntl() when the
 542  554           * size code is 0.  Don't set pgsz less than current size.
 543  555           */
 544  556          if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
 545  557                  pgsz = hw_page_array[p->p_brkpageszc].hp_size;
 546  558          }
 547  559  
 548  560          return (pgsz);
 549  561  }
 550  562  
 551  563  static size_t
 552  564  map_pgszstk(struct proc *p, caddr_t addr, size_t len)
 553  565  {
 554  566          size_t          pgsz = MMU_PAGESIZE;
 555  567          int             szc;
 556  568  
 557  569          /*
 558  570           * If len is zero, retrieve from proc and don't demote the page size.
 559  571           * Use atleast the default pagesize.
 560  572           */
 561  573          if (len == 0) {
 562  574                  len = p->p_stksize;
 563  575          }
 564  576          len = MAX(len, default_ustack_lpsize);
 565  577  
 566  578          for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
 567  579                  pgsz = hw_page_array[szc].hp_size;
 568  580                  if ((disable_auto_data_large_pages & (1 << szc)) ||
 569  581                      pgsz > max_ustack_lpsize)
 570  582                          continue;
 571  583                  if (len >= pgsz) {
 572  584                          break;
 573  585                  }
 574  586          }
 575  587  
 576  588          /*
 577  589           * If addr == 0 we were called by memcntl() or exec_args() when the
 578  590           * size code is 0.  Don't set pgsz less than current size.
 579  591           */
 580  592          if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
 581  593                  pgsz = hw_page_array[p->p_stkpageszc].hp_size;
 582  594          }
 583  595  
 584  596          return (pgsz);
 585  597  }
 586  598  
 587  599  static size_t
 588  600  map_pgszism(caddr_t addr, size_t len)
 589  601  {
 590  602          uint_t szc;
 591  603          size_t pgsz;
 592  604  
 593  605          for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
 594  606                  if (disable_ism_large_pages & (1 << szc))
 595  607                          continue;
 596  608  
 597  609                  pgsz = hw_page_array[szc].hp_size;
 598  610                  if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
 599  611                          return (pgsz);
 600  612          }
 601  613  
 602  614          return (DEFAULT_ISM_PAGESIZE);
 603  615  }
 604  616  
 605  617  /*
 606  618   * Suggest a page size to be used to map a segment of type maptype and length
 607  619   * len.  Returns a page size (not a size code).
 608  620   */
 609  621  /* ARGSUSED */
 610  622  size_t
 611  623  map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
 612  624  {
 613  625          size_t  pgsz = MMU_PAGESIZE;
 614  626  
 615  627          ASSERT(maptype != MAPPGSZ_VA);
 616  628  
 617  629          if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
 618  630                  return (MMU_PAGESIZE);
 619  631          }
 620  632  
 621  633          switch (maptype) {
 622  634          case MAPPGSZ_ISM:
 623  635                  pgsz = map_pgszism(addr, len);
 624  636                  break;
 625  637  
 626  638          case MAPPGSZ_STK:
 627  639                  if (max_ustack_lpsize > MMU_PAGESIZE) {
 628  640                          pgsz = map_pgszstk(p, addr, len);
 629  641                  }
 630  642                  break;
 631  643  
 632  644          case MAPPGSZ_HEAP:
 633  645                  if (max_uheap_lpsize > MMU_PAGESIZE) {
 634  646                          pgsz = map_pgszheap(p, addr, len);
 635  647                  }
 636  648                  break;
 637  649          }
 638  650          return (pgsz);
 639  651  }
 640  652  
 641  653  
 642  654  /* assumes TTE8K...TTE4M == szc */
 643  655  
 644  656  static uint_t
 645  657  map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
 646  658      size_t max_lpsize, size_t min_physmem)
 647  659  {
 648  660          caddr_t eaddr = addr + size;
 649  661          uint_t szcvec = 0;
 650  662          caddr_t raddr;
 651  663          caddr_t readdr;
 652  664          size_t pgsz;
 653  665          int i;
 654  666  
 655  667          if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
 656  668                  return (0);
 657  669          }
 658  670          for (i = mmu_page_sizes - 1; i > 0; i--) {
 659  671                  if (disable_lpgs & (1 << i)) {
 660  672                          continue;
 661  673                  }
 662  674                  pgsz = page_get_pagesize(i);
 663  675                  if (pgsz > max_lpsize) {
 664  676                          continue;
 665  677                  }
 666  678                  raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
 667  679                  readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
 668  680                  if (raddr < addr || raddr >= readdr) {
 669  681                          continue;
 670  682                  }
 671  683                  if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
 672  684                          continue;
 673  685                  }
 674  686                  szcvec |= (1 << i);
 675  687                  /*
 676  688                   * And or in the remaining enabled page sizes.
 677  689                   */
 678  690                  szcvec |= P2PHASE(~disable_lpgs, (1 << i));
 679  691                  szcvec &= ~1; /* no need to return 8K pagesize */
 680  692                  break;
 681  693          }
 682  694          return (szcvec);
 683  695  }
 684  696  
 685  697  /*
 686  698   * Return a bit vector of large page size codes that
 687  699   * can be used to map [addr, addr + len) region.
 688  700   */
 689  701  /* ARGSUSED */
 690  702  uint_t
 691  703  map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
 692  704      int memcntl)
 693  705  {
 694  706          if (flags & MAP_TEXT) {
 695  707                  return (map_szcvec(addr, size, off,
 696  708                      disable_auto_text_large_pages,
 697  709                      max_utext_lpsize, shm_lpg_min_physmem));
 698  710  
 699  711          } else if (flags & MAP_INITDATA) {
 700  712                  return (map_szcvec(addr, size, off,
 701  713                      disable_auto_data_large_pages,
 702  714                      max_uidata_lpsize, privm_lpg_min_physmem));
 703  715  
 704  716          } else if (type == MAPPGSZC_SHM) {
 705  717                  return (map_szcvec(addr, size, off,
 706  718                      disable_auto_data_large_pages,
 707  719                      max_shm_lpsize, shm_lpg_min_physmem));
 708  720  
 709  721          } else if (type == MAPPGSZC_HEAP) {
 710  722                  return (map_szcvec(addr, size, off,
 711  723                      disable_auto_data_large_pages,
 712  724                      max_uheap_lpsize, privm_lpg_min_physmem));
 713  725  
 714  726          } else if (type == MAPPGSZC_STACK) {
 715  727                  return (map_szcvec(addr, size, off,
 716  728                      disable_auto_data_large_pages,
 717  729                      max_ustack_lpsize, privm_lpg_min_physmem));
 718  730  
 719  731          } else {
 720  732                  return (map_szcvec(addr, size, off,
 721  733                      disable_auto_data_large_pages,
 722  734                      max_privmap_lpsize, privm_lpg_min_physmem));
 723  735          }
 724  736  }
 725  737  
 726  738  /*
 727  739   * Anchored in the table below are counters used to keep track
 728  740   * of free contiguous physical memory. Each element of the table contains
 729  741   * the array of counters, the size of array which is allocated during
 730  742   * startup based on physmax and a shift value used to convert a pagenum
 731  743   * into a counter array index or vice versa. The table has page size
 732  744   * for rows and region size for columns:
 733  745   *
 734  746   *      page_counters[page_size][region_size]
 735  747   *
 736  748   *      page_size:      TTE size code of pages on page_size freelist.
 737  749   *
  738  750   *      region_size:    TTE size code of a candidate larger page made up
  739  751   *                      of contiguous free page_size pages.
 740  752   *
 741  753   * As you go across a page_size row increasing region_size each
 742  754   * element keeps track of how many (region_size - 1) size groups
 743  755   * made up of page_size free pages can be coalesced into a
  744  756   * region_size page. Yuck! Let's try an example:
 745  757   *
 746  758   *      page_counters[1][3] is the table element used for identifying
 747  759   *      candidate 4M pages from contiguous pages off the 64K free list.
  748  760   *      Each index in the page_counters[1][3].array spans 4M. It's the
  749  761   *      number of free 512K size (region_size - 1) groups of contiguous
 750  762   *      64K free pages. So when page_counters[1][3].counters[n] == 8
 751  763   *      we know we have a candidate 4M page made up of 512K size groups
 752  764   *      of 64K free pages.
 753  765   */
 754  766  
 755  767  /*
 756  768   * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
 757  769   * dimensions are allocated dynamically.
 758  770   */
 759  771  page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
 760  772  
 761  773  /*
 762  774   * For now there is only a single size cache list.
 763  775   * Allocated dynamically.
 764  776   */
 765  777  page_t ***page_cachelists[MAX_MEM_TYPES];
 766  778  
 767  779  kmutex_t *fpc_mutex[NPC_MUTEX];	/* each entry -> max_mem_nodes kmutex_t */
 768  780  kmutex_t *cpc_mutex[NPC_MUTEX];	/* filled in by ndata_alloc_page_mutexs() */
 769  781  
 770  782  /*
 771  783   * Calculate space needed for page freelists and counters
 772  784   */
 773  785  size_t
 774  786  calc_free_pagelist_sz(void)
 775  787  {
 776  788          int szc;
 777  789          size_t alloc_sz, cache_sz, free_sz;
 778  790  
 779  791          /*
 780  792           * one cachelist per color, node, and type
 781  793           */
 782  794          cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
 783  795              sizeof (page_t **);
 784  796          cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
 785  797  
 786  798          /*
 787  799           * one freelist per size, color, node, and type
 788  800           */
 789  801          free_sz = sizeof (page_t **);
 790  802          for (szc = 0; szc < mmu_page_sizes; szc++)
 791  803                  free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
 792  804          free_sz *= max_mem_nodes * MAX_MEM_TYPES;
 793  805  
 794  806          alloc_sz = cache_sz + free_sz + page_ctrs_sz();
 795  807          return (alloc_sz);
 796  808  }
 797  809  
 798  810  caddr_t
 799  811  alloc_page_freelists(caddr_t alloc_base)
 800  812  {
 801  813          int     mnode, mtype;
 802  814          int     szc, clrs;
 803  815  
 804  816          /*
 805  817           * We only support small pages in the cachelist.
 806  818           */
 807  819          for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
 808  820                  page_cachelists[mtype] = (page_t ***)alloc_base;
 809  821                  alloc_base += (max_mem_nodes * sizeof (page_t **));
 810  822                  for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 811  823                          page_cachelists[mtype][mnode] = (page_t **)alloc_base;
 812  824                          alloc_base +=
 813  825                              (page_get_pagecolors(0) * sizeof (page_t *));
 814  826                  }
 815  827          }
 816  828  
 817  829          /*
 818  830           * Allocate freelists bins for all
 819  831           * supported page sizes.
 820  832           */
 821  833          for (szc = 0; szc < mmu_page_sizes; szc++) {
 822  834                  clrs = page_get_pagecolors(szc);
 823  835                  for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
 824  836                          page_freelists[szc][mtype] = (page_t ***)alloc_base;
 825  837                          alloc_base += (max_mem_nodes * sizeof (page_t **));
 826  838                          for (mnode = 0; mnode < max_mem_nodes; mnode++) {
 827  839                                  page_freelists[szc][mtype][mnode] =
 828  840                                      (page_t **)alloc_base;
 829  841                                  alloc_base += (clrs * (sizeof (page_t *)));
 830  842                          }
 831  843                  }
 832  844          }
 833  845  
 834  846          alloc_base = page_ctrs_alloc(alloc_base);
 835  847          return (alloc_base);
 836  848  }
 837  849  
 838  850  /*
 839  851   * Allocate page_freelists locks for a memnode from the nucleus data
 840  852   * area. This is the first time that mmu_page_sizes is used during
 841  853   * bootup, so check mmu_page_sizes initialization.
 842  854   */
 843  855  int
 844  856  ndata_alloc_page_mutexs(struct memlist *ndata)
 845  857  {
 846  858          size_t alloc_sz;
 847  859          caddr_t alloc_base;
 848  860          int     i;
 849  861          void    page_coloring_init();
 850  862  
 851  863          page_coloring_init();
 852  864          if (&mmu_init_mmu_page_sizes) {
 853  865                  if (!mmu_init_mmu_page_sizes(0)) {
 854  866                          cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
 855  867                              mmu_page_sizes);
 856  868                  }
 857  869          }
 858  870          ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
 859  871  
 860  872          /* fpc_mutex and cpc_mutex */
 861  873          alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
 862  874  
 863  875          alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
 864  876          if (alloc_base == NULL)
 865  877                  return (-1);
 866  878  
 867  879          ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
 868  880  
 869  881          for (i = 0; i < NPC_MUTEX; i++) {
 870  882                  fpc_mutex[i] = (kmutex_t *)alloc_base;
 871  883                  alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 872  884                  cpc_mutex[i] = (kmutex_t *)alloc_base;
 873  885                  alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
 874  886          }
 875  887          return (0);
 876  888  }
 877  889  
 878  890  /*
 879  891   * To select our starting bin, we stride through the bins with a stride
 880  892   * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 881  893   * in simulation and practice for different workloads on varying cache sizes.
 882  894   */
 883  895  uint32_t color_start_current = 0;
 884  896  uint32_t color_start_stride = 337;
 885  897  int color_start_random = 0;
 886  898  
 887  899  /* ARGSUSED */
 888  900  uint_t
 889  901  get_color_start(struct as *as)
 890  902  {
 891  903          uint32_t old, new;
 892  904  
 893  905          if (consistent_coloring == 2 || color_start_random) {
 894  906                  return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
 895  907                      (hw_page_array[0].hp_colors - 1)));
 896  908          }
 897  909  
 898  910          do {
 899  911                  old = color_start_current;
 900  912                  new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
 901  913          } while (atomic_cas_32(&color_start_current, old, new) != old);
 902  914  
 903  915          return ((uint_t)(new));
 904  916  }
 905  917  
 906  918  /*
 907  919   * Called once at startup from kphysm_init() -- before memialloc()
 908  920   * is invoked to do the 1st page_free()/page_freelist_add().
 909  921   *
 910  922   * initializes page_colors and page_colors_mask based on ecache_setsize.
 911  923   *
 912  924   * Also initializes the counter locks.
 913  925   */
 914  926  void
 915  927  page_coloring_init()
 916  928  {
 917  929          int     a, i;
 918  930          uint_t colors;
 919  931  
 920  932          if (do_pg_coloring == 0) {
 921  933                  page_colors = 1;
 922  934                  for (i = 0; i < mmu_page_sizes; i++) {
 923  935                          colorequivszc[i] = 0;
 924  936                          hw_page_array[i].hp_colors = 1;
 925  937                  }
 926  938                  return;
 927  939          }
 928  940  
 929  941          /*
 930  942           * Calculate page_colors from ecache_setsize. ecache_setsize contains
 931  943           * the max ecache setsize of all cpus configured in the system or, for
 932  944           * cheetah+ systems, the max possible ecache setsize for all possible
 933  945           * cheetah+ cpus.
 934  946           */
 935  947          page_colors = ecache_setsize / MMU_PAGESIZE;
 936  948          page_colors_mask = page_colors - 1;
 937  949  
 938  950          vac_colors = vac_size / MMU_PAGESIZE;
 939  951          vac_colors_mask = vac_colors -1;
 940  952  
 941  953          page_coloring_shift = 0;
 942  954          a = ecache_setsize;
 943  955          while (a >>= 1) {
 944  956                  page_coloring_shift++;
 945  957          }
 946  958  
 947  959          /* initialize number of colors per page size */
 948  960          for (i = 0; i < mmu_page_sizes; i++) {
 949  961                  hw_page_array[i].hp_colors = (page_colors_mask >>
 950  962                      (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
 951  963                      + 1;
 952  964                  colorequivszc[i] = 0;
 953  965          }
 954  966  
 955  967          /*
 956  968           * initialize cpu_page_colors if ecache setsizes are homogenous.
 957  969           * cpu_page_colors set to -1 during DR operation or during startup
 958  970           * if setsizes are heterogenous.
 959  971           *
 960  972           * The value of cpu_page_colors determines if additional color bins
 961  973           * need to be checked for a particular color in the page_get routines.
 962  974           */
 963  975          if (cpu_setsize > 0 && cpu_page_colors == 0 &&
 964  976              cpu_setsize < ecache_setsize) {
 965  977                  cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
 966  978                  a = lowbit(page_colors) - lowbit(cpu_page_colors);
 967  979                  ASSERT(a > 0);
 968  980                  ASSERT(a < 16);
 969  981  
 970  982                  for (i = 0; i < mmu_page_sizes; i++) {
 971  983                          if ((colors = hw_page_array[i].hp_colors) <= 1) {
 972  984                                  continue;
 973  985                          }
 974  986                          while ((colors >> a) == 0)
 975  987                                  a--;
 976  988                          ASSERT(a >= 0);
 977  989  
 978  990                          /* higher 4 bits encodes color equiv mask */
 979  991                          colorequivszc[i] = (a << 4);
 980  992                  }
 981  993          }
 982  994  
 983  995          /* do cpu specific color initialization */
 984  996          if (&page_coloring_init_cpu) {
 985  997                  page_coloring_init_cpu();
 986  998          }
 987  999  }
 988 1000  
 989 1001  int
 990 1002  bp_color(struct buf *bp)
 991 1003  {
 992 1004          int color = -1;
 993 1005  
 994 1006          if (vac) {
 995 1007                  if ((bp->b_flags & B_PAGEIO) != 0) {
 996 1008                          color = sfmmu_get_ppvcolor(bp->b_pages);
 997 1009                  } else if (bp->b_un.b_addr != NULL) {
 998 1010                          color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
 999 1011                  }
1000 1012          }
1001 1013          return (color < 0 ? 0 : ptob(color));
1002 1014  }
1003 1015  
1004 1016  /*
1005 1017   * Function for flushing D-cache when performing module relocations
1006 1018   * to an alternate mapping.  Stubbed out on all platforms except sun4u,
1007 1019   * at least for now.
1008 1020   */
1009 1021  void
1010 1022  dcache_flushall()
1011 1023  {
1012 1024          sfmmu_cache_flushall();	/* the only work done: forward to the sfmmu routine */
1013 1025  }
1014 1026  
/*
 * Report whether the byte ranges starting at va1 (sz1 bytes long) and at
 * va2 (sz2 bytes long) intersect.  Returns 0 when they are disjoint and
 * 1 otherwise.  Two ranges that start at the same address are always
 * reported as overlapping.
 *
 * Each size is compared against the gap between the two start addresses
 * instead of computing va + sz, so a size large enough to wrap the
 * address arithmetic cannot make an overlapping range look disjoint.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* range 1 starts lower and ends at or before the start of range 2 */
	if (va1 < va2 && sz1 <= va2 - va1)
		return (0);

	/* range 2 starts lower and ends at or before the start of range 1 */
	if (va2 < va1 && sz2 <= va1 - va2)
		return (0);

	return (1);
}
1026 1038  
1027 1039  /*
1028 1040   * Return the number of bytes, relative to the beginning of a given range, that
1029 1041   * are non-toxic (can be read from and written to with relative impunity).
1030 1042   */
1031 1043  size_t
1032 1044  kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1033 1045  {
1034 1046          /* OBP reads are harmless, but we don't want people writing there */
1035 1047          if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1036 1048              OFW_START_ADDR + 1))
1037 1049                  return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1038 1050  
1039 1051          if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1040 1052                  return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1041 1053  
1042 1054          return (sz); /* no overlap */
1043 1055  }
1044 1056  
1045 1057  /*
1046 1058   * Minimum physmem required for enabling large pages for kernel heap
1047 1059   * Currently we do not enable lp for kmem on systems with less
1048 1060   * than 1GB of memory. This value can be changed via /etc/system
1049 1061   */
1050 1062  size_t segkmem_lpminphysmem = 0x40000000;       /* 1GB */
1051 1063  
1052 1064  /*
1053 1065   * this function chooses large page size for kernel heap
1054 1066   */
1055 1067  size_t
1056 1068  get_segkmem_lpsize(size_t lpsize)
1057 1069  {
1058 1070          size_t memtotal = physmem * PAGESIZE;
1059 1071          size_t mmusz;
1060 1072          uint_t szc;
1061 1073  
1062 1074          if (memtotal < segkmem_lpminphysmem)
1063 1075                  return (PAGESIZE);
1064 1076  
1065 1077          if (plat_lpkmem_is_supported != NULL &&
1066 1078              plat_lpkmem_is_supported() == 0)
1067 1079                  return (PAGESIZE);
1068 1080  
1069 1081          mmusz = mmu_get_kernel_lpsize(lpsize);
1070 1082          szc = page_szc(mmusz);
1071 1083  
1072 1084          while (szc) {
1073 1085                  if (!(disable_large_pages & (1 << szc)))
1074 1086                          return (page_get_pagesize(szc));
1075 1087                  szc--;
1076 1088          }
1077 1089          return (PAGESIZE);
1078 1090  }

↓ open down ↓

689 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX