/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

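/*
 * Tunables: when set, and the process has SAUTOLPG in p_flag, brk() and
 * stack growth consult map_pgsz() to choose a large page size automatically.
 */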
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and is an address hint rather than a MAP_ALIGN alignment
 * request.
 */
int aslr_respect_mmap_hint = 0;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

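/*
 * brk(2) entry point.  Purely as an illustration (not the actual libc
 * source), sbrk(3C) can be layered on this interface roughly as
 *
 *	caddr_t cur = (caddr_t)brk(0);	-- nva == 0 queries the current brk
 *	(void) brk(cur + incr);		-- then move the brk by 'incr'
 *
 * which is why the nva == 0 query case is handled specially below.
 */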
intptr_t
brk(caddr_t nva)
{
	int error;
	proc_t *p = curproc;

	/*
	 * Serialize brk operations on an address space.
	 * This also serves as the lock protecting p_brksize
	 * and p_brkpageszc.
	 */
	as_rangelock(p->p_as);

	/*
	 * As a special case to aid the implementation of sbrk(3C), if given a
	 * new brk of 0, return the current brk.  We'll hide this in brk(3C).
	 */
	if (nva == 0) {
		as_rangeunlock(p->p_as);
		return ((intptr_t)(p->p_brkbase + p->p_brksize));
	}

	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		error = brk_lpg(nva);
	} else {
		error = brk_internal(nva, p->p_brkpageszc);
	}
	as_rangeunlock(p->p_as);
	return ((error != 0 ? set_errno(error) : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get the best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
	struct proc *p = curproc;
	size_t pgsz, len;
	caddr_t addr, brkend;
	caddr_t bssbase = p->p_bssbase;
	caddr_t brkbase = p->p_brkbase;
	int oszc, szc;
	int err;

	oszc = p->p_brkpageszc;

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk_internal() will initialize it.
	 */
	if (brkbase == 0) {
		return (brk_internal(nva, oszc));
	}

	len = nva - bssbase;

	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for an invalid page size, so we want to
	 * ignore it in that case.
	 * 2. By design we never decrease page size, as it is more stable.
	 */
	if (szc <= oszc) {
		err = brk_internal(nva, oszc);
		/* If that failed, back off to the base page size. */
		if (err != 0 && oszc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	err = brk_internal(nva, szc);
	/* If using szc failed, map with the base page size and return. */
	if (err != 0) {
		if (szc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	/*
	 * Round up brk base to a large page boundary and remap
	 * anything in the segment already faulted in beyond that
	 * point.
	 */
	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
	brkend = brkbase + p->p_brksize;
	len = brkend - addr;
	/* Check that len is not negative. Update page size code for heap. */
	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_brkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);	/* should always be 0 */
}

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * Extend the heap to brkszc alignment but use the current
	 * p->p_brkpageszc for the newly created segment. This allows the
	 * new extension segment to be concatenated successfully with the
	 * existing brk segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

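	/* Snapshot the enforced RLIMIT_DATA resource control under p_lock. */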
	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed, p_brksize was the value
	 * not rounded to the pagesize (i.e. it stored the exact user request
	 * for heap size). If pgsz is greater than PAGESIZE, calculate the
	 * heap size as the real new heap size by rounding it up to pgsz.
	 * This is useful since we may want to know where the heap ends
	 * without knowing the heap pagesize (e.g. some old code), and also,
	 * if the heap pagesize changes, we can update p_brkpageszc but delay
	 * adding the new mapping yet still know from p_brksize where the
	 * heap really ends. The user-requested heap end is stored in a libc
	 * variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * Use PAGESIZE to round up ova because we want to know the real value
	 * of the current heap end in case p_brkpageszc has changed since the
	 * last time p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
	    PAGESIZE);

	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add a new zfod mapping to extend the UNIX data segment.
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release the mapping to shrink the UNIX data segment.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	p->p_brksize = size;
	return (0);
}

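/*
 * grow() is typically reached from the pagefault path when a faulting
 * address lies just below the current bottom of the stack.
 */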
/*
 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t oldsize = p->p_stksize;
	size_t newsize;
	int err;

	/*
	 * Serialize grow operations on an address space.
	 * This also serves as the lock protecting p_stksize
	 * and p_stkpageszc.
	 */
	as_rangelock(as);
	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		err = grow_lpg(sp);
	} else {
		err = grow_internal(sp, p->p_stkpageszc);
	}
	as_rangeunlock(as);

	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
		/*
		 * Set up translations so the process doesn't have to fault in
		 * the stack pages we just gave it.
		 */
		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
		    newsize - oldsize, F_INVAL, S_WRITE);
	}
	return ((err == 0 ? 1 : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get the best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
	struct proc *p = curproc;
	size_t pgsz;
	size_t len, newsize;
	caddr_t addr, saddr;
	caddr_t growend;
	int oszc, szc;
	int err;

	newsize = p->p_usrstack - sp;

	oszc = p->p_stkpageszc;
	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for an invalid page size, so we want to
	 * ignore it in that case.
	 * 2. By design we never decrease page size, as it is more stable.
	 * This shouldn't happen as the stack never shrinks.
	 */
	if (szc <= oszc) {
		err = grow_internal(sp, oszc);
		/* If that failed, fall back to the base page size. */
		if (err != 0 && oszc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * We've grown sufficiently to switch to a new page size, so
	 * remap the whole segment with the new page size.
	 */
	err = grow_internal(sp, szc);
	/* If the grow with szc failed, fall back to the base page size. */
	if (err != 0) {
		if (szc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * Round up the stack pointer to a large page boundary and remap
	 * any pgsz pages in the segment already faulted in beyond that
	 * point.
	 */
	saddr = p->p_usrstack - p->p_stksize;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
	len = growend - addr;
	/* Check that len is not negative. Update page size code for stack. */
	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_stkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);	/* should always be 0 */
}

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
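	/* Round the faulting address down to a page boundary. */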
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * Grow to growszc alignment but use the current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * Extend the stack with the proposed new growszc, which differs from
	 * p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}

/*
 * Find an address for the user to map. If MAP_FIXED is not specified, we
 * can pick any address we want, but we will first try the value in *addrp
 * if it is non-NULL and _MAP_RANDOMIZE is not set. Thus this implements a
 * way to try to get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
	size_t lenp = len;

	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
	if (flags & MAP_FIXED) {
		(void) as_unmap(as, *addrp, len);
		return (0);
	} else if (basep != NULL &&
	    ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
		/* The user-supplied address was available. */
		*addrp = basep;
	} else {
		/*
		 * No user-supplied address, or the address supplied was not
		 * available.
		 */
		map_addr(addrp, len, off, vacalign, flags);
	}
	if (*addrp == NULL)
		return (ENOMEM);
	return (0);
}


/*
 * Used for MAP_ANON - a fast way to get anonymous pages.
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
	struct segvn_crargs vn_a;
	int error;

	if ((PROT_ALL & uprot) != uprot)
		return (EACCES);

	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address. First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}
	/*
	 * No need to worry about vac alignment for anonymous
	 * pages since this is a "clone" object that doesn't
	 * yet exist.
	 */
	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
	if (error != 0) {
		return (error);
	}

	/*
	 * Use the seg_vn segment driver; passing in the NULL amp
	 * gives the desired "cloning" effect.
	 */
	vn_a.vp = NULL;
	vn_a.offset = 0;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = uprot;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = CRED();
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	return (as_map(as, *addrp, len, segvn_create, &vn_a));
}

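/*
 * A mapping may be randomized (ASLR) unless it is MAP_FIXED, or unless the
 * caller passed a plain (non-MAP_ALIGN) address hint and the
 * aslr_respect_mmap_hint tunable says such hints should be honored.
 */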
#define	RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
	!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))

static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;
	int in_crit = 0;

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

	if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
	    (MAP_FIXED | _MAP_RANDOMIZE)) {
		return (EINVAL);
	}

	/*
	 * If it's not a fixed allocation and mmap ASLR is enabled, randomize
	 * it.
	 */
	if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
	    secflag_enabled(curproc, PROC_SEC_ASLR))
		flags |= _MAP_RANDOMIZE;

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call". If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);


	if (flags & MAP_ALIGN) {
		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
		    !ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	if (fp == NULL) {
		ASSERT(flags & MAP_ANON);
		/* discard lwpchan mappings, like munmap() */
		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (error == 0 && (flags & MAP_SHARED)) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);

	vp = fp->f_vnode;

	/* Can't execute code from a "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large file support.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode? E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also? This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections. Also test to make sure
	 * that the file descriptor allows read access, since "write only"
	 * mappings are hard to do: normally we must read from the file
	 * before a page can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here.
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address. First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}

	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
	    nbl_need_check(vp)) {
		int svmand;
		nbl_op_t nop;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto done;
		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
			if (prot & (PROT_READ | PROT_EXEC)) {
				nop = NBL_READWRITE;
			} else {
				nop = NBL_WRITE;
			}
		} else {
			nop = NBL_READ;
		}
		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* discard lwpchan mappings, like munmap() */
	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

	if (error == 0) {
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (flags & MAP_SHARED) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode.
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again. Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
	struct file *fp;
	int error;

	if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&addr, len, prot, flags,
		    NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&addr, len, prot, flags,
		    fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif	/* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
	struct file *fp;
	int error;
	caddr_t a = (caddr_t)(uintptr_t)addr;

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
	caddr_t addr;
	size_t len;
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;
	uint64_t flags;
	uint64_t fd;
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;
	uint32_t flags;
	uint32_t fd;
	uint32_t offhi;
	uint32_t offlo;
#endif
};

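/*
 * The 64-bit file offset arrives as two 32-bit cells; smmaplf32() below
 * reassembles it as (offhi << 32) | offlo, with the two cells swapped on
 * little-endian machines.
 */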
int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
	struct file *fp;
	int error;
	caddr_t a = uap->addr;
	int flags = (int)uap->flags;
	int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, NULL, off);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, fp, off);
		releasef(fd);
	} else
		error = EBADF;

	if (error == 0)
		rvp->r_val1 = (uintptr_t)a;
	return (error);
}

#endif	/* _SYSCALL32_IMPL || _ILP32 */

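/*
 * munmap(2): validate the range, discard any lwpchan registrations that
 * fall within it, then unmap it from the address space.
 */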
int
munmap(caddr_t addr, size_t len)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(EINVAL));

	/*
	 * Discard lwpchan mappings.
	 */
	if (p->p_lcp != NULL)
		lwpchan_delete_mapping(p, addr, addr + len);
	if (as_unmap(as, addr, len) != 0)
		return (set_errno(EINVAL));

	return (0);
}

int
mprotect(caddr_t addr, size_t len, int prot)
{
	struct as *as = curproc->p_as;
	uint_t uprot = prot | PROT_USER;
	int error;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
	case RANGE_OKAY:
		break;
	case RANGE_BADPROT:
		return (set_errno(ENOTSUP));
	case RANGE_BADADDR:
	default:
		return (set_errno(ENOMEM));
	}

	error = as_setprot(as, addr, len, uprot);
	if (error)
		return (set_errno(error));
	return (0);
}

#define	MC_CACHE	128			/* internal result buffer */
#define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */

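/*
 * mincore(2) walks the range MC_QUANTUM bytes (MC_CACHE pages) at a time,
 * staging per-page results in a small on-stack vector before copying them
 * out to the user's vector.
 */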
int
mincore(caddr_t addr, size_t len, char *vecp)
{
	struct as *as = curproc->p_as;
	caddr_t ea;			/* end address of loop */
	size_t rl;			/* inner result length */
	char vec[MC_CACHE];		/* local vector cache */
	int error;
	model_t model;
	long llen;

	model = get_udatamodel();
	/*
	 * Validate the form of the address parameters.
	 */
	if (model == DATAMODEL_NATIVE) {
		llen = (long)len;
	} else {
		llen = (int32_t)(size32_t)len;
	}
	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(ENOMEM));

	/*
	 * Loop over subranges of the interval [addr : addr + len), recovering
	 * results internally and then copying them out to the caller. The
	 * subrange size is based on MC_CACHE, defined above.
	 */
	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
		error = as_incore(as, addr,
		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
		if (rl != 0) {
			rl = (rl + PAGESIZE - 1) / PAGESIZE;
			if (copyout(vec, vecp, rl) != 0)
				return (set_errno(EFAULT));
			vecp += rl;
		}
		if (error != 0)
			return (set_errno(ENOMEM));
	}
	return (0);
}