os/grow: define 'p' under the same ifdef as it's consumed
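The change is confined to choose_addr() below: the local pointer 'p' (curproc)
is consumed only inside the #if defined(__amd64) branch that derives lenp from
p->p_usrstack and p->p_stk_ctl, so its declaration is now guarded by the same
ifdef. Without the guard, non-amd64 builds declare and initialize 'p' but never
read it, which compilers and lint typically flag as an unused variable.

A minimal sketch of the pattern (hypothetical names such as demo(), not kernel
code; the real change is in choose_addr() further down):

    #include <stdio.h>

    int
    demo(int flags)
    {
    #if defined(__amd64)
            int p = 42;             /* consumed only in the amd64-only branch */
    #endif
            int lenp;

            if (flags & 0x1)
                    lenp = 1;
    #if defined(__amd64)
            else if (p != 0)        /* the sole consumer of 'p' */
                    lenp = p;
    #endif
            else
                    lenp = 0;

            return (lenp);
    }

    int
    main(void)
    {
            printf("%d\n", demo(0));
            return (0);
    }

Declaring 'p' under the same #if defined(__amd64) as its only use keeps both
the amd64 and non-amd64 configurations warning-clean without changing behavior.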
--- old/usr/src/uts/common/os/grow.c
+++ new/usr/src/uts/common/os/grow.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */
23 23
24 24 /*
25 25 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
26 26 * Use is subject to license terms.
27 27 */
28 28
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 30 /* All Rights Reserved */
31 31
32 32 #include <sys/types.h>
33 33 #include <sys/inttypes.h>
34 34 #include <sys/param.h>
35 35 #include <sys/sysmacros.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/signal.h>
38 38 #include <sys/user.h>
39 39 #include <sys/errno.h>
40 40 #include <sys/var.h>
41 41 #include <sys/proc.h>
42 42 #include <sys/tuneable.h>
43 43 #include <sys/debug.h>
44 44 #include <sys/cmn_err.h>
45 45 #include <sys/cred.h>
46 46 #include <sys/vnode.h>
47 47 #include <sys/vfs.h>
48 48 #include <sys/vm.h>
49 49 #include <sys/file.h>
50 50 #include <sys/mman.h>
51 51 #include <sys/vmparam.h>
52 52 #include <sys/fcntl.h>
53 53 #include <sys/lwpchan_impl.h>
54 54 #include <sys/nbmlock.h>
55 55
56 56 #include <vm/hat.h>
57 57 #include <vm/as.h>
58 58 #include <vm/seg.h>
59 59 #include <vm/seg_dev.h>
60 60 #include <vm/seg_vn.h>
61 61
62 62 int use_brk_lpg = 1;
63 63 int use_stk_lpg = 1;
64 64
65 65 static int brk_lpg(caddr_t nva);
66 66 static int grow_lpg(caddr_t sp);
67 67
68 68 int
69 69 brk(caddr_t nva)
70 70 {
71 71 int error;
72 72 proc_t *p = curproc;
73 73
74 74 /*
75 75 * Serialize brk operations on an address space.
76 76 * This also serves as the lock protecting p_brksize
77 77 * and p_brkpageszc.
78 78 */
79 79 as_rangelock(p->p_as);
80 80 if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
81 81 error = brk_lpg(nva);
82 82 } else {
83 83 error = brk_internal(nva, p->p_brkpageszc);
84 84 }
85 85 as_rangeunlock(p->p_as);
86 86 return ((error != 0 ? set_errno(error) : 0));
87 87 }
88 88
89 89 /*
90 90 * Algorithm: call arch-specific map_pgsz to get best page size to use,
91 91 * then call brk_internal().
92 92 * Returns 0 on success.
93 93 */
94 94 static int
95 95 brk_lpg(caddr_t nva)
96 96 {
97 97 struct proc *p = curproc;
98 98 size_t pgsz, len;
99 99 caddr_t addr, brkend;
100 100 caddr_t bssbase = p->p_bssbase;
101 101 caddr_t brkbase = p->p_brkbase;
102 102 int oszc, szc;
103 103 int err;
104 104
105 105 oszc = p->p_brkpageszc;
106 106
107 107 /*
108 108 * If p_brkbase has not yet been set, the first call
109 109 * to brk_internal() will initialize it.
110 110 */
111 111 if (brkbase == 0) {
112 112 return (brk_internal(nva, oszc));
113 113 }
114 114
115 115 len = nva - bssbase;
116 116
117 117 pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
118 118 szc = page_szc(pgsz);
119 119
120 120 /*
121 121 * Covers two cases:
122 122 * 1. page_szc() returns -1 for invalid page size, so we want to
123 123 * ignore it in that case.
124 124 * 2. By design we never decrease page size, as it is more stable.
125 125 */
126 126 if (szc <= oszc) {
127 127 err = brk_internal(nva, oszc);
128 128 /* If failed, back off to base page size. */
129 129 if (err != 0 && oszc != 0) {
130 130 err = brk_internal(nva, 0);
131 131 }
132 132 return (err);
133 133 }
134 134
135 135 err = brk_internal(nva, szc);
136 136 /* If using szc failed, map with base page size and return. */
137 137 if (err != 0) {
138 138 if (szc != 0) {
139 139 err = brk_internal(nva, 0);
140 140 }
141 141 return (err);
142 142 }
143 143
144 144 /*
145 145 * Round up brk base to a large page boundary and remap
146 146 * anything in the segment already faulted in beyond that
147 147 * point.
148 148 */
149 149 addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
150 150 brkend = brkbase + p->p_brksize;
151 151 len = brkend - addr;
152 152 /* Check that len is not negative. Update page size code for heap. */
153 153 if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
154 154 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
155 155 p->p_brkpageszc = szc;
156 156 }
157 157
158 158 ASSERT(err == 0);
159 159 return (err); /* should always be 0 */
160 160 }
161 161
162 162 /*
163 163 * Returns 0 on success.
164 164 */
165 165 int
166 166 brk_internal(caddr_t nva, uint_t brkszc)
167 167 {
168 168 caddr_t ova; /* current break address */
169 169 size_t size;
170 170 int error;
171 171 struct proc *p = curproc;
172 172 struct as *as = p->p_as;
173 173 size_t pgsz;
174 174 uint_t szc;
175 175 rctl_qty_t as_rctl;
176 176
177 177 /*
178 178 * extend heap to brkszc alignment but use current p->p_brkpageszc
179 179 * for the newly created segment. This allows the new extension
180 180 * segment to be concatenated successfully with the existing brk
181 181 * segment.
182 182 */
183 183 if ((szc = brkszc) != 0) {
184 184 pgsz = page_get_pagesize(szc);
185 185 ASSERT(pgsz > PAGESIZE);
186 186 } else {
187 187 pgsz = PAGESIZE;
188 188 }
189 189
190 190 mutex_enter(&p->p_lock);
191 191 as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
192 192 p->p_rctls, p);
193 193 mutex_exit(&p->p_lock);
194 194
195 195 /*
196 196 * If p_brkbase has not yet been set, the first call
197 197 * to brk() will initialize it.
198 198 */
199 199 if (p->p_brkbase == 0)
200 200 p->p_brkbase = nva;
201 201
202 202 /*
203 203 * Before multiple page size support existed p_brksize was the value
204 204 * not rounded to the pagesize (i.e. it stored the exact user request
205 205 * for heap size). If pgsz is greater than PAGESIZE calculate the
206 206 * heap size as the real new heap size by rounding it up to pgsz.
207 207 * This is useful since we may want to know where the heap ends
208 208 * without knowing heap pagesize (e.g. some old code) and also if
209 209 * heap pagesize changes we can update p_brkpageszc but delay adding
210 210 * new mapping yet still know from p_brksize where the heap really
211 211 * ends. The user-requested heap end is stored in a libc variable.
212 212 */
213 213 if (pgsz > PAGESIZE) {
214 214 caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
215 215 size = tnva - p->p_brkbase;
216 216 if (tnva < p->p_brkbase || (size > p->p_brksize &&
217 217 size > (size_t)as_rctl)) {
218 218 szc = 0;
219 219 pgsz = PAGESIZE;
220 220 size = nva - p->p_brkbase;
221 221 }
222 222 } else {
223 223 size = nva - p->p_brkbase;
224 224 }
225 225
226 226 /*
227 227 * use PAGESIZE to roundup ova because we want to know the real value
228 228 * of the current heap end in case p_brkpageszc changes since the last
229 229 * p_brksize was computed.
230 230 */
231 231 nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
232 232 ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
233 233 PAGESIZE);
234 234
235 235 if ((nva < p->p_brkbase) || (size > p->p_brksize &&
236 236 size > as_rctl)) {
237 237 mutex_enter(&p->p_lock);
238 238 (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
239 239 RCA_SAFE);
240 240 mutex_exit(&p->p_lock);
241 241 return (ENOMEM);
242 242 }
243 243
244 244 if (nva > ova) {
245 245 struct segvn_crargs crargs =
246 246 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
247 247
248 248 if (!(p->p_datprot & PROT_EXEC)) {
249 249 crargs.prot &= ~PROT_EXEC;
250 250 }
251 251
252 252 /*
253 253 * Add new zfod mapping to extend UNIX data segment
254 254 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
255 255 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
256 256 * page sizes if ova is not aligned to szc's pgsz.
257 257 */
258 258 if (szc > 0) {
259 259 caddr_t rbss;
260 260
261 261 rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
262 262 pgsz);
263 263 if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
264 264 crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
265 265 AS_MAP_NO_LPOOB;
266 266 } else if (ova == rbss) {
267 267 crargs.szc = szc;
268 268 } else {
269 269 crargs.szc = AS_MAP_HEAP;
270 270 }
271 271 } else {
272 272 crargs.szc = AS_MAP_NO_LPOOB;
273 273 }
274 274 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
275 275 error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
276 276 &crargs);
277 277 if (error) {
278 278 return (error);
279 279 }
280 280
281 281 } else if (nva < ova) {
282 282 /*
283 283 * Release mapping to shrink UNIX data segment.
284 284 */
285 285 (void) as_unmap(as, nva, (size_t)(ova - nva));
286 286 }
287 287 p->p_brksize = size;
288 288 return (0);
289 289 }
290 290
291 291 /*
292 292 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
293 293 * This routine assumes that the stack grows downward.
294 294 */
295 295 int
296 296 grow(caddr_t sp)
297 297 {
298 298 struct proc *p = curproc;
299 299 struct as *as = p->p_as;
300 300 size_t oldsize = p->p_stksize;
301 301 size_t newsize;
302 302 int err;
303 303
304 304 /*
305 305 * Serialize grow operations on an address space.
306 306 * This also serves as the lock protecting p_stksize
307 307 * and p_stkpageszc.
308 308 */
309 309 as_rangelock(as);
310 310 if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
311 311 err = grow_lpg(sp);
312 312 } else {
313 313 err = grow_internal(sp, p->p_stkpageszc);
314 314 }
315 315 as_rangeunlock(as);
316 316
317 317 if (err == 0 && (newsize = p->p_stksize) > oldsize) {
318 318 ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
319 319 ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
320 320 /*
321 321 * Set up translations so the process doesn't have to fault in
322 322 * the stack pages we just gave it.
323 323 */
324 324 (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
325 325 newsize - oldsize, F_INVAL, S_WRITE);
326 326 }
327 327 return ((err == 0 ? 1 : 0));
328 328 }
329 329
330 330 /*
331 331 * Algorithm: call arch-specific map_pgsz to get best page size to use,
332 332 * then call grow_internal().
333 333 * Returns 0 on success.
334 334 */
335 335 static int
336 336 grow_lpg(caddr_t sp)
337 337 {
338 338 struct proc *p = curproc;
339 339 size_t pgsz;
340 340 size_t len, newsize;
341 341 caddr_t addr, saddr;
342 342 caddr_t growend;
343 343 int oszc, szc;
344 344 int err;
345 345
346 346 newsize = p->p_usrstack - sp;
347 347
348 348 oszc = p->p_stkpageszc;
349 349 pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
350 350 szc = page_szc(pgsz);
351 351
352 352 /*
353 353 * Covers two cases:
354 354 * 1. page_szc() returns -1 for invalid page size, so we want to
355 355 * ignore it in that case.
356 356 * 2. By design we never decrease page size, as it is more stable.
357 357 * This shouldn't happen as the stack never shrinks.
358 358 */
359 359 if (szc <= oszc) {
360 360 err = grow_internal(sp, oszc);
361 361 /* failed, fall back to base page size */
362 362 if (err != 0 && oszc != 0) {
363 363 err = grow_internal(sp, 0);
364 364 }
365 365 return (err);
366 366 }
367 367
368 368 /*
369 369 * We've grown sufficiently to switch to a new page size.
370 370 * So we are going to remap the whole segment with the new page size.
371 371 */
372 372 err = grow_internal(sp, szc);
373 373 /* The grow with szc failed, so fall back to base page size. */
374 374 if (err != 0) {
375 375 if (szc != 0) {
376 376 err = grow_internal(sp, 0);
377 377 }
378 378 return (err);
379 379 }
380 380
381 381 /*
382 382 * Round up stack pointer to a large page boundary and remap
383 383 * any pgsz pages in the segment already faulted in beyond that
384 384 * point.
385 385 */
386 386 saddr = p->p_usrstack - p->p_stksize;
387 387 addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
388 388 growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
389 389 len = growend - addr;
390 390 /* Check that len is not negative. Update page size code for stack. */
391 391 if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
392 392 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
393 393 p->p_stkpageszc = szc;
394 394 }
395 395
396 396 ASSERT(err == 0);
397 397 return (err); /* should always be 0 */
398 398 }
399 399
400 400 /*
401 401 * This routine assumes that the stack grows downward.
402 402 * Returns 0 on success, errno on failure.
403 403 */
404 404 int
405 405 grow_internal(caddr_t sp, uint_t growszc)
406 406 {
407 407 struct proc *p = curproc;
408 408 size_t newsize;
409 409 size_t oldsize;
410 410 int error;
411 411 size_t pgsz;
412 412 uint_t szc;
413 413 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
414 414
415 415 ASSERT(sp < p->p_usrstack);
416 416 sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
417 417
418 418 /*
419 419 * grow to growszc alignment but use current p->p_stkpageszc for
420 420 * the segvn_crargs szc passed to segvn_create. For memcntl to
421 421 * increase the szc, this allows the new extension segment to be
422 422 * concatenated successfully with the existing stack segment.
423 423 */
424 424 if ((szc = growszc) != 0) {
425 425 pgsz = page_get_pagesize(szc);
426 426 ASSERT(pgsz > PAGESIZE);
427 427 newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
428 428 if (newsize > (size_t)p->p_stk_ctl) {
429 429 szc = 0;
430 430 pgsz = PAGESIZE;
431 431 newsize = p->p_usrstack - sp;
432 432 }
433 433 } else {
434 434 pgsz = PAGESIZE;
435 435 newsize = p->p_usrstack - sp;
436 436 }
437 437
438 438 if (newsize > (size_t)p->p_stk_ctl) {
439 439 (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
440 440 RCA_UNSAFE_ALL);
441 441
442 442 return (ENOMEM);
443 443 }
444 444
445 445 oldsize = p->p_stksize;
446 446 ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
447 447
448 448 if (newsize <= oldsize) { /* prevent the stack from shrinking */
449 449 return (0);
450 450 }
451 451
452 452 if (!(p->p_stkprot & PROT_EXEC)) {
453 453 crargs.prot &= ~PROT_EXEC;
454 454 }
455 455 /*
456 456 * extend stack with the proposed new growszc, which is different
457 457 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
458 458 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
459 459 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
460 460 * if not aligned to szc's pgsz.
461 461 */
462 462 if (szc > 0) {
463 463 caddr_t oldsp = p->p_usrstack - oldsize;
464 464 caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
465 465 pgsz);
466 466
467 467 if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
468 468 crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
469 469 AS_MAP_NO_LPOOB;
470 470 } else if (oldsp == austk) {
471 471 crargs.szc = szc;
472 472 } else {
473 473 crargs.szc = AS_MAP_STACK;
474 474 }
475 475 } else {
476 476 crargs.szc = AS_MAP_NO_LPOOB;
477 477 }
478 478 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
479 479
480 480 if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
481 481 segvn_create, &crargs)) != 0) {
482 482 if (error == EAGAIN) {
483 483 cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
484 484 "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
485 485 }
486 486 return (error);
487 487 }
488 488 p->p_stksize = newsize;
489 489 return (0);
490 490 }
491 491
492 492 /*
493 493 * Find address for user to map.
494 494 * If MAP_FIXED is not specified, we can pick any address we want, but we will
495 495 * first try the value in *addrp if it is non-NULL. Thus this is implementing
496 496 * a way to try and get a preferred address.
497 497 */
498 498 int
499 499 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
500 500 int vacalign, uint_t flags)
501 501 {
502 +#if defined(__amd64)
502 503 proc_t *p = curproc;
504 +#endif
503 505 caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
504 506 size_t lenp;
505 507
506 508 ASSERT(AS_ISCLAIMGAP(as)); /* searches should be serialized */
507 509
508 510 /*
509 511 * If we have been provided a hint, we should still expand the lenp
510 512 * to be the rest of the address space. This will allow us to
511 513 * treat the hint as a strong desire to be "nearby" the provided
512 514 * address. If we can't satisfy the hint, as_gap() will walk forward.
513 515 */
514 516 if (flags & _MAP_LOW32)
515 517 lenp = (caddr_t)USERLIMIT32 - basep;
516 518 #if defined(__amd64)
517 519 else if (p->p_model == DATAMODEL_NATIVE)
518 520 lenp = p->p_usrstack - basep -
519 521 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
520 522 #endif
521 523 else
522 524 lenp = as->a_userlimit - basep;
523 525
524 526 if (flags & MAP_FIXED) {
525 527 (void) as_unmap(as, *addrp, len);
526 528 return (0);
527 529 } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
528 530 !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
529 531 /* User supplied address was available */
530 532 *addrp = basep;
531 533 } else {
532 534 /*
533 535 * No user supplied address or the address supplied was not
534 536 * available.
535 537 */
536 538 map_addr(addrp, len, off, vacalign, flags);
537 539 }
538 540 if (*addrp == NULL)
539 541 return (ENOMEM);
540 542 return (0);
541 543 }
542 544
543 545
544 546 /*
545 547 * Used for MAP_ANON - fast way to get anonymous pages
546 548 */
547 549 static int
548 550 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
549 551 offset_t pos)
550 552 {
551 553 struct segvn_crargs vn_a;
552 554 int error;
553 555
554 556 if (((PROT_ALL & uprot) != uprot))
555 557 return (EACCES);
556 558
557 559 if ((flags & MAP_FIXED) != 0) {
558 560 caddr_t userlimit;
559 561
560 562 /*
561 563 * Use the user address. First verify that
562 564 * the address to be used is page aligned.
563 565 * Then make some simple bounds checks.
564 566 */
565 567 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
566 568 return (EINVAL);
567 569
568 570 userlimit = flags & _MAP_LOW32 ?
569 571 (caddr_t)USERLIMIT32 : as->a_userlimit;
570 572 switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
571 573 case RANGE_OKAY:
572 574 break;
573 575 case RANGE_BADPROT:
574 576 return (ENOTSUP);
575 577 case RANGE_BADADDR:
576 578 default:
577 579 return (ENOMEM);
578 580 }
579 581 }
580 582 /*
581 583 * No need to worry about vac alignment for anonymous
582 584 * pages since this is a "clone" object that doesn't
583 585 * yet exist.
584 586 */
585 587 error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
586 588 if (error != 0) {
587 589 return (error);
588 590 }
589 591
590 592 /*
591 593 * Use the seg_vn segment driver; passing in the NULL amp
592 594 * gives the desired "cloning" effect.
593 595 */
594 596 vn_a.vp = NULL;
595 597 vn_a.offset = 0;
596 598 vn_a.type = flags & MAP_TYPE;
597 599 vn_a.prot = uprot;
598 600 vn_a.maxprot = PROT_ALL;
599 601 vn_a.flags = flags & ~MAP_TYPE;
600 602 vn_a.cred = CRED();
601 603 vn_a.amp = NULL;
602 604 vn_a.szc = 0;
603 605 vn_a.lgrp_mem_policy_flags = 0;
604 606
605 607 return (as_map(as, *addrp, len, segvn_create, &vn_a));
606 608 }
607 609
608 610 static int
609 611 smmap_common(caddr_t *addrp, size_t len,
610 612 int prot, int flags, struct file *fp, offset_t pos)
611 613 {
612 614 struct vnode *vp;
613 615 struct as *as = curproc->p_as;
614 616 uint_t uprot, maxprot, type;
615 617 int error;
616 618 int in_crit = 0;
617 619
618 620 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
619 621 _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
620 622 MAP_TEXT | MAP_INITDATA)) != 0) {
621 623 /* | MAP_RENAME */ /* not implemented, let user know */
622 624 return (EINVAL);
623 625 }
624 626
625 627 if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
626 628 return (EINVAL);
627 629 }
628 630
629 631 if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
630 632 return (EINVAL);
631 633 }
632 634
633 635 #if defined(__sparc)
634 636 /*
635 637 * See if this is an "old mmap call". If so, remember this
636 638 * fact and convert the flags value given to mmap to indicate
637 639 * the specified address in the system call must be used.
638 640 * _MAP_NEW is set by all new uses of mmap.
639 641 */
640 642 if ((flags & _MAP_NEW) == 0)
641 643 flags |= MAP_FIXED;
642 644 #endif
643 645 flags &= ~_MAP_NEW;
644 646
645 647 type = flags & MAP_TYPE;
646 648 if (type != MAP_PRIVATE && type != MAP_SHARED)
647 649 return (EINVAL);
648 650
649 651
650 652 if (flags & MAP_ALIGN) {
651 653
652 654 if (flags & MAP_FIXED)
653 655 return (EINVAL);
654 656
655 657 /* alignment needs to be a power of 2 >= page size */
656 658 if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
657 659 !ISP2((uintptr_t)*addrp))
658 660 return (EINVAL);
659 661 }
660 662 /*
661 663 * Check for bad lengths and file position.
662 664 * We let the VOP_MAP routine check for negative lengths
663 665 * since on some vnode types this might be appropriate.
664 666 */
665 667 if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
666 668 return (EINVAL);
667 669
668 670 maxprot = PROT_ALL; /* start out allowing all accesses */
669 671 uprot = prot | PROT_USER;
670 672
671 673 if (fp == NULL) {
672 674 ASSERT(flags & MAP_ANON);
673 675 /* discard lwpchan mappings, like munmap() */
674 676 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
675 677 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
676 678 as_rangelock(as);
677 679 error = zmap(as, addrp, len, uprot, flags, pos);
678 680 as_rangeunlock(as);
679 681 /*
680 682 * Tell machine specific code that lwp has mapped shared memory
681 683 */
682 684 if (error == 0 && (flags & MAP_SHARED)) {
683 685 /* EMPTY */
684 686 LWP_MMODEL_SHARED_AS(*addrp, len);
685 687 }
686 688 return (error);
687 689 } else if ((flags & MAP_ANON) != 0)
688 690 return (EINVAL);
689 691
690 692 vp = fp->f_vnode;
691 693
692 694 /* Can't execute code from "noexec" mounted filesystem. */
693 695 if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
694 696 maxprot &= ~PROT_EXEC;
695 697
696 698 /*
697 699 * These checks were added as part of large files.
698 700 *
699 701 * Return ENXIO if the initial position is negative; return EOVERFLOW
700 702 * if (offset + len) would overflow the maximum allowed offset for the
701 703 * type of file descriptor being used.
702 704 */
703 705 if (vp->v_type == VREG) {
704 706 if (pos < 0)
705 707 return (ENXIO);
706 708 if ((offset_t)len > (OFFSET_MAX(fp) - pos))
707 709 return (EOVERFLOW);
708 710 }
709 711
710 712 if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
711 713 /* no write access allowed */
712 714 maxprot &= ~PROT_WRITE;
713 715 }
714 716
715 717 /*
716 718 * XXX - Do we also adjust maxprot based on protections
717 719 * of the vnode? E.g. if no execute permission is given
718 720 * on the vnode for the current user, maxprot probably
719 721 * should disallow PROT_EXEC also? This is different
720 722 * from the write access as this would be a per vnode
721 723 * test as opposed to a per fd test for writability.
722 724 */
723 725
724 726 /*
725 727 * Verify that the specified protections are not greater than
726 728 * the maximum allowable protections. Also test to make sure
728 730 * that the file descriptor allows for read access since
728 730 * "write only" mappings are hard to do since normally we do
729 731 * the read from the file before the page can be written.
730 732 */
731 733 if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
732 734 return (EACCES);
733 735
734 736 /*
735 737 * If the user specified an address, do some simple checks here
736 738 */
737 739 if ((flags & MAP_FIXED) != 0) {
738 740 caddr_t userlimit;
739 741
740 742 /*
741 743 * Use the user address. First verify that
742 744 * the address to be used is page aligned.
743 745 * Then make some simple bounds checks.
744 746 */
745 747 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
746 748 return (EINVAL);
747 749
748 750 userlimit = flags & _MAP_LOW32 ?
749 751 (caddr_t)USERLIMIT32 : as->a_userlimit;
750 752 switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
751 753 case RANGE_OKAY:
752 754 break;
753 755 case RANGE_BADPROT:
754 756 return (ENOTSUP);
755 757 case RANGE_BADADDR:
756 758 default:
757 759 return (ENOMEM);
758 760 }
759 761 }
760 762
761 763 if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
762 764 nbl_need_check(vp)) {
763 765 int svmand;
764 766 nbl_op_t nop;
765 767
766 768 nbl_start_crit(vp, RW_READER);
767 769 in_crit = 1;
768 770 error = nbl_svmand(vp, fp->f_cred, &svmand);
769 771 if (error != 0)
770 772 goto done;
771 773 if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
772 774 if (prot & (PROT_READ | PROT_EXEC)) {
773 775 nop = NBL_READWRITE;
774 776 } else {
775 777 nop = NBL_WRITE;
776 778 }
777 779 } else {
778 780 nop = NBL_READ;
779 781 }
780 782 if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
781 783 error = EACCES;
782 784 goto done;
783 785 }
784 786 }
785 787
786 788 /* discard lwpchan mappings, like munmap() */
787 789 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
788 790 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
789 791
790 792 /*
791 793 * Ok, now let the vnode map routine do its thing to set things up.
792 794 */
793 795 error = VOP_MAP(vp, pos, as,
794 796 addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
795 797
796 798 if (error == 0) {
797 799 /*
798 800 * Tell machine specific code that lwp has mapped shared memory
799 801 */
800 802 if (flags & MAP_SHARED) {
801 803 /* EMPTY */
802 804 LWP_MMODEL_SHARED_AS(*addrp, len);
803 805 }
804 806 if (vp->v_type == VREG &&
805 807 (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
806 808 /*
807 809 * Mark this as an executable vnode
808 810 */
809 811 mutex_enter(&vp->v_lock);
810 812 vp->v_flag |= VVMEXEC;
811 813 mutex_exit(&vp->v_lock);
812 814 }
813 815 }
814 816
815 817 done:
816 818 if (in_crit)
817 819 nbl_end_crit(vp);
818 820 return (error);
819 821 }
820 822
821 823 #ifdef _LP64
822 824 /*
823 825 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
824 826 *
825 827 * The "large file" mmap routine mmap64(2) is also mapped to this routine
826 828 * by the 64-bit version of libc.
827 829 *
828 830 * Eventually, this should be the only version, and have smmap_common()
829 831 * folded back into it again. Some day.
830 832 */
831 833 caddr_t
832 834 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
833 835 {
834 836 struct file *fp;
835 837 int error;
836 838
837 839 if (fd == -1 && (flags & MAP_ANON) != 0)
838 840 error = smmap_common(&addr, len, prot, flags,
839 841 NULL, (offset_t)pos);
840 842 else if ((fp = getf(fd)) != NULL) {
841 843 error = smmap_common(&addr, len, prot, flags,
842 844 fp, (offset_t)pos);
843 845 releasef(fd);
844 846 } else
845 847 error = EBADF;
846 848
847 849 return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
848 850 }
849 851 #endif /* _LP64 */
850 852
851 853 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
852 854
853 855 /*
854 856 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
855 857 */
856 858 caddr_t
857 859 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
858 860 {
859 861 struct file *fp;
860 862 int error;
861 863 caddr_t a = (caddr_t)(uintptr_t)addr;
862 864
863 865 if (flags & _MAP_LOW32)
864 866 error = EINVAL;
865 867 else if (fd == -1 && (flags & MAP_ANON) != 0)
866 868 error = smmap_common(&a, (size_t)len, prot,
867 869 flags | _MAP_LOW32, NULL, (offset_t)pos);
868 870 else if ((fp = getf(fd)) != NULL) {
869 871 error = smmap_common(&a, (size_t)len, prot,
870 872 flags | _MAP_LOW32, fp, (offset_t)pos);
871 873 releasef(fd);
872 874 } else
873 875 error = EBADF;
874 876
875 877 ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
876 878
877 879 return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
878 880 }
879 881
880 882 /*
881 883 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
882 884 *
883 885 * Now things really get ugly because we can't use the C-style
884 886 * calling convention for more than 6 args, and 64-bit parameter
885 887 * passing on 32-bit systems is less than clean.
886 888 */
887 889
888 890 struct mmaplf32a {
889 891 caddr_t addr;
890 892 size_t len;
891 893 #ifdef _LP64
892 894 /*
893 895 * 32-bit contents, 64-bit cells
894 896 */
895 897 uint64_t prot;
896 898 uint64_t flags;
897 899 uint64_t fd;
898 900 uint64_t offhi;
899 901 uint64_t offlo;
900 902 #else
901 903 /*
902 904 * 32-bit contents, 32-bit cells
903 905 */
904 906 uint32_t prot;
905 907 uint32_t flags;
906 908 uint32_t fd;
907 909 uint32_t offhi;
908 910 uint32_t offlo;
909 911 #endif
910 912 };
911 913
912 914 int
913 915 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
914 916 {
915 917 struct file *fp;
916 918 int error;
917 919 caddr_t a = uap->addr;
918 920 int flags = (int)uap->flags;
919 921 int fd = (int)uap->fd;
920 922 #ifdef _BIG_ENDIAN
921 923 offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
922 924 #else
923 925 offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
924 926 #endif
925 927
926 928 if (flags & _MAP_LOW32)
927 929 error = EINVAL;
928 930 else if (fd == -1 && (flags & MAP_ANON) != 0)
929 931 error = smmap_common(&a, uap->len, (int)uap->prot,
930 932 flags | _MAP_LOW32, NULL, off);
931 933 else if ((fp = getf(fd)) != NULL) {
932 934 error = smmap_common(&a, uap->len, (int)uap->prot,
933 935 flags | _MAP_LOW32, fp, off);
934 936 releasef(fd);
935 937 } else
936 938 error = EBADF;
937 939
938 940 if (error == 0)
939 941 rvp->r_val1 = (uintptr_t)a;
940 942 return (error);
941 943 }
942 944
943 945 #endif /* _SYSCALL32_IMPL || _ILP32 */
944 946
945 947 int
946 948 munmap(caddr_t addr, size_t len)
947 949 {
948 950 struct proc *p = curproc;
949 951 struct as *as = p->p_as;
950 952
951 953 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
952 954 return (set_errno(EINVAL));
953 955
954 956 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
955 957 return (set_errno(EINVAL));
956 958
957 959 /*
958 960 * Discard lwpchan mappings.
959 961 */
960 962 if (p->p_lcp != NULL)
961 963 lwpchan_delete_mapping(p, addr, addr + len);
962 964 if (as_unmap(as, addr, len) != 0)
963 965 return (set_errno(EINVAL));
964 966
965 967 return (0);
966 968 }
967 969
968 970 int
969 971 mprotect(caddr_t addr, size_t len, int prot)
970 972 {
971 973 struct as *as = curproc->p_as;
972 974 uint_t uprot = prot | PROT_USER;
973 975 int error;
974 976
975 977 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
976 978 return (set_errno(EINVAL));
977 979
978 980 switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
979 981 case RANGE_OKAY:
980 982 break;
981 983 case RANGE_BADPROT:
982 984 return (set_errno(ENOTSUP));
983 985 case RANGE_BADADDR:
984 986 default:
985 987 return (set_errno(ENOMEM));
986 988 }
987 989
988 990 error = as_setprot(as, addr, len, uprot);
989 991 if (error)
990 992 return (set_errno(error));
991 993 return (0);
992 994 }
993 995
994 996 #define MC_CACHE 128 /* internal result buffer */
995 997 #define MC_QUANTUM (MC_CACHE * PAGESIZE) /* addresses covered in loop */
996 998
997 999 int
998 1000 mincore(caddr_t addr, size_t len, char *vecp)
999 1001 {
1000 1002 struct as *as = curproc->p_as;
1001 1003 caddr_t ea; /* end address of loop */
1002 1004 size_t rl; /* inner result length */
1003 1005 char vec[MC_CACHE]; /* local vector cache */
1004 1006 int error;
1005 1007 model_t model;
1006 1008 long llen;
1007 1009
1008 1010 model = get_udatamodel();
1009 1011 /*
1010 1012 * Validate form of address parameters.
1011 1013 */
1012 1014 if (model == DATAMODEL_NATIVE) {
1013 1015 llen = (long)len;
1014 1016 } else {
1015 1017 llen = (int32_t)(size32_t)len;
1016 1018 }
1017 1019 if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1018 1020 return (set_errno(EINVAL));
1019 1021
1020 1022 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1021 1023 return (set_errno(ENOMEM));
1022 1024
1023 1025 /*
1024 1026 * Loop over subranges of interval [addr : addr + len), recovering
1025 1027 * results internally and then copying them out to caller. Subrange
1026 1028 * is based on the size of MC_CACHE, defined above.
1027 1029 */
1028 1030 for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1029 1031 error = as_incore(as, addr,
1030 1032 (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1031 1033 if (rl != 0) {
1032 1034 rl = (rl + PAGESIZE - 1) / PAGESIZE;
1033 1035 if (copyout(vec, vecp, rl) != 0)
1034 1036 return (set_errno(EFAULT));
1035 1037 vecp += rl;
1036 1038 }
1037 1039 if (error != 0)
1038 1040 return (set_errno(ENOMEM));
1039 1041 }
1040 1042 return (0);
1041 1043 }