/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

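/*
 * Tunables: when set (the default) and the process has SAUTOLPG enabled,
 * brk() and grow() may select large pages for the heap and stack via
 * brk_lpg() and grow_lpg().
 */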
int use_brk_lpg = 1;
int use_stk_lpg = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

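/*
 * brk(2) system call entry point: move the end of the process heap
 * (the "break") to nva.  Returns 0 on success; on failure the error
 * is posted via set_errno().
 */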
int
brk(caddr_t nva)
{
        int error;
        proc_t *p = curproc;

        /*
         * Serialize brk operations on an address space.
         * This also serves as the lock protecting p_brksize
         * and p_brkpageszc.
         */
        as_rangelock(p->p_as);
        if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                error = brk_lpg(nva);
        } else {
                error = brk_internal(nva, p->p_brkpageszc);
        }
        as_rangeunlock(p->p_as);
        return ((error != 0 ? set_errno(error) : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
        struct proc *p = curproc;
        size_t pgsz, len;
        caddr_t addr, brkend;
        caddr_t bssbase = p->p_bssbase;
        caddr_t brkbase = p->p_brkbase;
        int oszc, szc;
        int err;

        oszc = p->p_brkpageszc;

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk_internal() will initialize it.
         */
        if (brkbase == 0) {
                return (brk_internal(nva, oszc));
        }

        len = nva - bssbase;

        pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         */
        if (szc <= oszc) {
                err = brk_internal(nva, oszc);
                /* If failed, back off to base page size. */
                if (err != 0 && oszc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        err = brk_internal(nva, szc);
        /* If using szc failed, map with base page size and return. */
        if (err != 0) {
                if (szc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        /*
         * Round up brk base to a large page boundary and remap
         * anything in the segment already faulted in beyond that
         * point.
         */
        addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
        brkend = brkbase + p->p_brksize;
        len = brkend - addr;
        /* Check that len is not negative. Update page size code for heap. */
        if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_brkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
        caddr_t ova;                    /* current break address */
        size_t size;
        int     error;
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t pgsz;
        uint_t szc;
        rctl_qty_t as_rctl;

        /*
         * extend heap to brkszc alignment but use current p->p_brkpageszc
         * for the newly created segment. This allows the new extension
         * segment to be concatenated successfully with the existing brk
         * segment.
         */
        if ((szc = brkszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
        } else {
                pgsz = PAGESIZE;
        }

        mutex_enter(&p->p_lock);
        as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
            p->p_rctls, p);
        mutex_exit(&p->p_lock);

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk() will initialize it.
         */
        if (p->p_brkbase == 0)
                p->p_brkbase = nva;

        /*
         * Before support for multiple page sizes existed, p_brksize was
         * not rounded up to the page size (i.e. it stored the exact user
         * request for the heap size).  If pgsz is greater than PAGESIZE,
         * compute the new heap size by rounding it up to pgsz.  This is
         * useful because it lets us find the end of the heap without
         * knowing the heap page size (e.g. some old code), and if the
         * heap page size changes we can update p_brkpageszc but defer
         * adding the new mapping while p_brksize still tells us where
         * the heap really ends.  The user-requested heap end is stored
         * in a libc variable.
         */
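        /*
         * Illustrative example (values are hypothetical): with a 4 MB
         * large page (pgsz = 0x400000) and a requested break of
         * p_brkbase + 0x123456, tnva below is rounded up to the next
         * 4 MB boundary, so size reflects the large-page-aligned heap
         * end rather than the exact user request.
         */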
        if (pgsz > PAGESIZE) {
                caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
                size = tnva - p->p_brkbase;
                if (tnva < p->p_brkbase || (size > p->p_brksize &&
                    size > (size_t)as_rctl)) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        size = nva - p->p_brkbase;
                }
        } else {
                size = nva - p->p_brkbase;
        }

        /*
         * Use PAGESIZE to round up ova because we want to know the real
         * value of the current heap end in case p_brkpageszc has changed
         * since the last time p_brksize was computed.
         */
        nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
        ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
            PAGESIZE);

        if ((nva < p->p_brkbase) || (size > p->p_brksize &&
            size > as_rctl)) {
                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
                    RCA_SAFE);
                mutex_exit(&p->p_lock);
                return (ENOMEM);
        }

        if (nva > ova) {
                struct segvn_crargs crargs =
                    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

                if (!(p->p_datprot & PROT_EXEC)) {
                        crargs.prot &= ~PROT_EXEC;
                }

                /*
                 * Add a new zfod mapping to extend the UNIX data segment.
                 * AS_MAP_NO_LPOOB means use szc 0 and don't reapply OOB
                 * policies via map_pgszcvec().  Use AS_MAP_HEAP to get
                 * intermediate page sizes if ova is not aligned to szc's
                 * pgsz.
                 */
                if (szc > 0) {
                        caddr_t rbss;

                        rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
                            pgsz);
                        if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
                                crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
                                    AS_MAP_NO_LPOOB;
                        } else if (ova == rbss) {
                                crargs.szc = szc;
                        } else {
                                crargs.szc = AS_MAP_HEAP;
                        }
                } else {
                        crargs.szc = AS_MAP_NO_LPOOB;
                }
                crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
                error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
                    &crargs);
                if (error) {
                        return (error);
                }

        } else if (nva < ova) {
                /*
                 * Release mapping to shrink UNIX data segment.
                 */
                (void) as_unmap(as, nva, (size_t)(ova - nva));
        }
        p->p_brksize = size;
        return (0);
}

/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t oldsize = p->p_stksize;
        size_t newsize;
        int err;

        /*
         * Serialize grow operations on an address space.
         * This also serves as the lock protecting p_stksize
         * and p_stkpageszc.
         */
        as_rangelock(as);
        if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                err = grow_lpg(sp);
        } else {
                err = grow_internal(sp, p->p_stkpageszc);
        }
        as_rangeunlock(as);

        if (err == 0 && (newsize = p->p_stksize) > oldsize) {
                ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
                ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
                /*
                 * Set up translations so the process doesn't have to fault in
                 * the stack pages we just gave it.
                 */
                (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
                    newsize - oldsize, F_INVAL, S_WRITE);
        }
        return ((err == 0 ? 1 : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
        struct proc *p = curproc;
        size_t pgsz;
        size_t len, newsize;
        caddr_t addr, saddr;
        caddr_t growend;
        int oszc, szc;
        int err;

        newsize = p->p_usrstack - sp;

        oszc = p->p_stkpageszc;
        pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         * This shouldn't happen as the stack never shrinks.
         */
        if (szc <= oszc) {
                err = grow_internal(sp, oszc);
                /* failed, fall back to base page size */
                if (err != 0 && oszc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * We've grown sufficiently to switch to a new page size.
         * So we are going to remap the whole segment with the new page size.
         */
        err = grow_internal(sp, szc);
        /* The grow with szc failed, so fall back to base page size. */
        if (err != 0) {
                if (szc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * Round up stack pointer to a large page boundary and remap
         * any pgsz pages in the segment already faulted in beyond that
         * point.
         */
        saddr = p->p_usrstack - p->p_stksize;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
        growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
        len = growend - addr;
        /* Check that len is not negative. Update page size code for stack. */
        if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_stkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
        struct proc *p = curproc;
        size_t newsize;
        size_t oldsize;
        int    error;
        size_t pgsz;
        uint_t szc;
        struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

        ASSERT(sp < p->p_usrstack);
        sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

        /*
         * grow to growszc alignment but use current p->p_stkpageszc for
         * the segvn_crargs szc passed to segvn_create. For memcntl to
         * increase the szc, this allows the new extension segment to be
         * concatenated successfully with the existing stack segment.
         */
        if ((szc = growszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
                newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
                if (newsize > (size_t)p->p_stk_ctl) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        newsize = p->p_usrstack - sp;
                }
        } else {
                pgsz = PAGESIZE;
                newsize = p->p_usrstack - sp;
        }

        if (newsize > (size_t)p->p_stk_ctl) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
                    RCA_UNSAFE_ALL);

                return (ENOMEM);
        }

        oldsize = p->p_stksize;
        ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

        if (newsize <= oldsize) {    /* prevent the stack from shrinking */
                return (0);
        }

        if (!(p->p_stkprot & PROT_EXEC)) {
                crargs.prot &= ~PROT_EXEC;
        }
        /*
         * Extend the stack with the proposed new growszc, which differs
         * from p_stkpageszc only on a memcntl to increase the stack pagesize.
         * AS_MAP_NO_LPOOB means use szc 0 and don't reapply OOB policies via
         * map_pgszcvec().  Use AS_MAP_STACK to get intermediate page sizes
         * if not aligned to szc's pgsz.
         */
        if (szc > 0) {
                caddr_t oldsp = p->p_usrstack - oldsize;
                caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
                    pgsz);

                if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
                        crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                            AS_MAP_NO_LPOOB;
                } else if (oldsp == austk) {
                        crargs.szc = szc;
                } else {
                        crargs.szc = AS_MAP_STACK;
                }
        } else {
                crargs.szc = AS_MAP_NO_LPOOB;
        }
        crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

        if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
            segvn_create, &crargs)) != 0) {
                if (error == EAGAIN) {
                        cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                            "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
                }
                return (error);
        }
        p->p_stksize = newsize;
        return (0);
}

/*
 * Find address for user to map.
 * If MAP_FIXED is not specified, we can pick any address we want, but we will
 * first try the value in *addrp if it is non-NULL.  This implements a way to
 * try to get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
        proc_t *p = curproc;
        caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
        size_t lenp;

        ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */

        /*
         * If we have been provided a hint, we should still expand the lenp
         * to be the rest of the address space.  This will allow us to
         * treat the hint as a strong desire to be "nearby" the provided
         * address.  If we can't satisfy the hint, as_gap() will walk forward.
         */
        if (flags & _MAP_LOW32)
                lenp = (caddr_t)USERLIMIT32 - basep;
#if defined(__amd64)
        else if (p->p_model == DATAMODEL_NATIVE)
                lenp = p->p_usrstack - basep -
                    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
#endif
        else
                lenp = as->a_userlimit - basep;

        if (flags & MAP_FIXED) {
                (void) as_unmap(as, *addrp, len);
                return (0);
        } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
            !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
                /* User supplied address was available */
                *addrp = basep;
        } else {
                /*
                 * No user supplied address or the address supplied was not
                 * available.
                 */
                map_addr(addrp, len, off, vacalign, flags);
        }
        if (*addrp == NULL)
                return (ENOMEM);
        return (0);
}


/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
        struct segvn_crargs vn_a;
        int error;

        if (((PROT_ALL & uprot) != uprot))
                return (EACCES);

        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }
        /*
         * No need to worry about vac alignment for anonymous
         * pages since this is a "clone" object that doesn't
         * yet exist.
         */
        error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
        if (error != 0) {
                return (error);
        }

        /*
         * Use the seg_vn segment driver; passing in the NULL amp
         * gives the desired "cloning" effect.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = uprot;
        vn_a.maxprot = PROT_ALL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = CRED();
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        return (as_map(as, *addrp, len, segvn_create, &vn_a));
}

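/*
 * Common code for mmap(2) and its variants: validate the flags,
 * protections, and (for file mappings) the file descriptor, then hand
 * the request to zmap() for anonymous memory or to VOP_MAP() for the
 * underlying vnode.
 */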
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
        struct vnode *vp;
        struct as *as = curproc->p_as;
        uint_t uprot, maxprot, type;
        int error;
        int in_crit = 0;

        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
            _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
            MAP_TEXT | MAP_INITDATA)) != 0) {
                /* | MAP_RENAME */      /* not implemented, let user know */
                return (EINVAL);
        }

        if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
                return (EINVAL);
        }

        if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
                return (EINVAL);
        }

#if defined(__sparc)
        /*
         * See if this is an "old mmap call".  If so, remember this
         * fact and convert the flags value given to mmap to indicate
         * the specified address in the system call must be used.
         * _MAP_NEW is set by all new uses of mmap.
         */
        if ((flags & _MAP_NEW) == 0)
                flags |= MAP_FIXED;
#endif
        flags &= ~_MAP_NEW;

        type = flags & MAP_TYPE;
        if (type != MAP_PRIVATE && type != MAP_SHARED)
                return (EINVAL);


        if (flags & MAP_ALIGN) {

                if (flags & MAP_FIXED)
                        return (EINVAL);

                /* alignment needs to be a power of 2 >= page size */
                if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
                    !ISP2((uintptr_t)*addrp))
                        return (EINVAL);
        }
        /*
         * Check for bad lengths and file position.
         * We let the VOP_MAP routine check for negative lengths
         * since on some vnode types this might be appropriate.
         */
        if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
                return (EINVAL);

        maxprot = PROT_ALL;             /* start out allowing all accesses */
        uprot = prot | PROT_USER;

        if (fp == NULL) {
                ASSERT(flags & MAP_ANON);
                /* discard lwpchan mappings, like munmap() */
                if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                        lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
                as_rangelock(as);
                error = zmap(as, addrp, len, uprot, flags, pos);
                as_rangeunlock(as);
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (error == 0 && (flags & MAP_SHARED)) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                return (error);
        } else if ((flags & MAP_ANON) != 0)
                return (EINVAL);

        vp = fp->f_vnode;

        /* Can't execute code from "noexec" mounted filesystem. */
        if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
                maxprot &= ~PROT_EXEC;

        /*
         * These checks were added as part of large files.
         *
         * Return ENXIO if the initial position is negative; return EOVERFLOW
         * if (offset + len) would overflow the maximum allowed offset for the
         * type of file descriptor being used.
         */
        if (vp->v_type == VREG) {
                if (pos < 0)
                        return (ENXIO);
                if ((offset_t)len > (OFFSET_MAX(fp) - pos))
                        return (EOVERFLOW);
        }

        if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
                /* no write access allowed */
                maxprot &= ~PROT_WRITE;
        }

        /*
         * XXX - Do we also adjust maxprot based on protections
         * of the vnode?  E.g. if no execute permission is given
         * on the vnode for the current user, maxprot probably
         * should disallow PROT_EXEC also?  This is different
         * from the write access as this would be a per vnode
         * test as opposed to a per fd test for writability.
         */

        /*
         * Verify that the specified protections are not greater than
         * the maximum allowable protections.  Also make sure that the
         * file descriptor allows read access, since "write only"
         * mappings are hard to do: we normally read the page in from
         * the file before it can be written.
         */
        if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
                return (EACCES);

        /*
         * If the user specified an address, do some simple checks here
         */
        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }

        if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
            nbl_need_check(vp)) {
                int svmand;
                nbl_op_t nop;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto done;
                if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
                        if (prot & (PROT_READ | PROT_EXEC)) {
                                nop = NBL_READWRITE;
                        } else {
                                nop = NBL_WRITE;
                        }
                } else {
                        nop = NBL_READ;
                }
                if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
                        error = EACCES;
                        goto done;
                }
        }

        /* discard lwpchan mappings, like munmap() */
        if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

        /*
         * Ok, now let the vnode map routine do its thing to set things up.
         */
        error = VOP_MAP(vp, pos, as,
            addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

        if (error == 0) {
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (flags & MAP_SHARED) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                if (vp->v_type == VREG &&
                    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
                        /*
                         * Mark this as an executable vnode
                         */
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VVMEXEC;
                        mutex_exit(&vp->v_lock);
                }
        }

done:
        if (in_crit)
                nbl_end_crit(vp);
        return (error);
}

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
        struct file *fp;
        int error;

        if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&addr, len, prot, flags,
                    NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&addr, len, prot, flags,
                    fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif  /* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
        struct file *fp;
        int error;
        caddr_t a = (caddr_t)(uintptr_t)addr;

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
        caddr_t addr;
        size_t len;
#ifdef _LP64
        /*
         * 32-bit contents, 64-bit cells
         */
        uint64_t prot;
        uint64_t flags;
        uint64_t fd;
        uint64_t offhi;
        uint64_t offlo;
#else
        /*
         * 32-bit contents, 32-bit cells
         */
        uint32_t prot;
        uint32_t flags;
        uint32_t fd;
        uint32_t offhi;
        uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
        struct file *fp;
        int error;
        caddr_t a = uap->addr;
        int flags = (int)uap->flags;
        int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
        offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
        offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, NULL, off);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, fp, off);
                releasef(fd);
        } else
                error = EBADF;

        if (error == 0)
                rvp->r_val1 = (uintptr_t)a;
        return (error);
}

#endif  /* _SYSCALL32_IMPL || _ILP32 */

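/*
 * munmap(2): remove any mappings in the range [addr, addr + len) from
 * the calling process's address space.
 */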
int
munmap(caddr_t addr, size_t len)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(EINVAL));

        /*
         * Discard lwpchan mappings.
         */
        if (p->p_lcp != NULL)
                lwpchan_delete_mapping(p, addr, addr + len);
        if (as_unmap(as, addr, len) != 0)
                return (set_errno(EINVAL));

        return (0);
}

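/*
 * mprotect(2): change the protections on the mappings covering
 * [addr, addr + len) to prot.
 */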
int
mprotect(caddr_t addr, size_t len, int prot)
{
        struct as *as = curproc->p_as;
        uint_t uprot = prot | PROT_USER;
        int error;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
        case RANGE_OKAY:
                break;
        case RANGE_BADPROT:
                return (set_errno(ENOTSUP));
        case RANGE_BADADDR:
        default:
                return (set_errno(ENOMEM));
        }

        error = as_setprot(as, addr, len, uprot);
        if (error)
                return (set_errno(error));
        return (0);
}

#define MC_CACHE        128                     /* internal result buffer */
#define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */

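/*
 * mincore(2): report, one byte per page, whether each page in
 * [addr, addr + len) is resident in memory.
 */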
int
mincore(caddr_t addr, size_t len, char *vecp)
{
        struct as *as = curproc->p_as;
        caddr_t ea;                     /* end address of loop */
        size_t rl;                      /* inner result length */
        char vec[MC_CACHE];             /* local vector cache */
        int error;
        model_t model;
        long    llen;

        model = get_udatamodel();
        /*
         * Validate form of address parameters.
         */
        if (model == DATAMODEL_NATIVE) {
                llen = (long)len;
        } else {
                llen = (int32_t)(size32_t)len;
        }
        if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(ENOMEM));

        /*
         * Loop over subranges of interval [addr : addr + len), recovering
         * results internally and then copying them out to caller.  Subrange
         * is based on the size of MC_CACHE, defined above.
         */
        for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
                error = as_incore(as, addr,
                    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
                if (rl != 0) {
                        rl = (rl + PAGESIZE - 1) / PAGESIZE;
                        if (copyout(vec, vecp, rl) != 0)
                                return (set_errno(EFAULT));
                        vecp += rl;
                }
                if (error != 0)
                        return (set_errno(ENOMEM));
        }
        return (0);
}