/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

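/*
 * Tunables: when set (and the process has SAUTOLPG enabled), the heap and
 * stack growth paths try to use large pages via brk_lpg()/grow_lpg().
 */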
int use_brk_lpg = 1;
int use_stk_lpg = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

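/*
 * brk(2): set the end of the process heap (the "break") to nva.
 */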
intptr_t
brk(caddr_t nva)
{
        int error;
        proc_t *p = curproc;

        /*
         * As a special case to aid the implementation of sbrk(3C), if given a
         * new brk of 0, return the current brk.  We'll hide this in brk(3C).
         */
        if (nva == 0)
                return ((intptr_t)(p->p_brkbase + p->p_brksize));

        /*
         * Serialize brk operations on an address space.
         * This also serves as the lock protecting p_brksize
         * and p_brkpageszc.
         */
        as_rangelock(p->p_as);
        if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                error = brk_lpg(nva);
        } else {
                error = brk_internal(nva, p->p_brkpageszc);
        }
        as_rangeunlock(p->p_as);
        return ((error != 0 ? set_errno(error) : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
        struct proc *p = curproc;
        size_t pgsz, len;
        caddr_t addr, brkend;
        caddr_t bssbase = p->p_bssbase;
        caddr_t brkbase = p->p_brkbase;
        int oszc, szc;
        int err;

        oszc = p->p_brkpageszc;

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk_internal() will initialize it.
         */
        if (brkbase == 0) {
                return (brk_internal(nva, oszc));
        }

        len = nva - bssbase;

        pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         */
        if (szc <= oszc) {
                err = brk_internal(nva, oszc);
                /* If failed, back off to base page size. */
                if (err != 0 && oszc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        err = brk_internal(nva, szc);
        /* If using szc failed, map with base page size and return. */
        if (err != 0) {
                if (szc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        /*
         * Round up brk base to a large page boundary and remap
         * anything in the segment already faulted in beyond that
         * point.
         */
        addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
        brkend = brkbase + p->p_brksize;
        len = brkend - addr;
        /* Check that len is not negative. Update page size code for heap. */
        if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_brkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
        caddr_t ova;                    /* current break address */
        size_t size;
        int     error;
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t pgsz;
        uint_t szc;
        rctl_qty_t as_rctl;

        /*
         * extend heap to brkszc alignment but use current p->p_brkpageszc
         * for the newly created segment. This allows the new extension
         * segment to be concatenated successfully with the existing brk
         * segment.
         */
        if ((szc = brkszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
        } else {
                pgsz = PAGESIZE;
        }

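        /*
         * Fetch the enforced RLIMIT_DATA value; requests that would grow
         * the heap beyond it fail with ENOMEM below.
         */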
        mutex_enter(&p->p_lock);
        as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
            p->p_rctls, p);
        mutex_exit(&p->p_lock);

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk() will initialize it.
         */
        if (p->p_brkbase == 0)
                p->p_brkbase = nva;

        /*
         * Before multiple page size support existed p_brksize was the value
         * not rounded to the pagesize (i.e. it stored the exact user request
         * for heap size). If pgsz is greater than PAGESIZE calculate the
         * heap size as the real new heap size by rounding it up to pgsz.
         * This is useful since we may want to know where the heap ends
         * without knowing heap pagesize (e.g. some old code) and also if
         * heap pagesize changes we can update p_brkpageszc but delay adding
         * new mapping yet still know from p_brksize where the heap really
         * ends. The user-requested heap end is stored in a libc variable.
         */
        if (pgsz > PAGESIZE) {
                caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
                size = tnva - p->p_brkbase;
                if (tnva < p->p_brkbase || (size > p->p_brksize &&
                    size > (size_t)as_rctl)) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        size = nva - p->p_brkbase;
                }
        } else {
                size = nva - p->p_brkbase;
        }

        /*
         * Use PAGESIZE to round up ova because we want to know the real value
         * of the current heap end in case p_brkpageszc changed since the last
         * time p_brksize was computed.
         */
        nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
        ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
            PAGESIZE);

        if ((nva < p->p_brkbase) || (size > p->p_brksize &&
            size > as_rctl)) {
                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
                    RCA_SAFE);
                mutex_exit(&p->p_lock);
                return (ENOMEM);
        }

        if (nva > ova) {
                struct segvn_crargs crargs =
                    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

                if (!(p->p_datprot & PROT_EXEC)) {
                        crargs.prot &= ~PROT_EXEC;
                }

                /*
                 * Add new zfod mapping to extend UNIX data segment
                 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
                 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
                 * page sizes if ova is not aligned to szc's pgsz.
                 */
                if (szc > 0) {
                        caddr_t rbss;

                        rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
                            pgsz);
                        if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
                                crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
                                    AS_MAP_NO_LPOOB;
                        } else if (ova == rbss) {
                                crargs.szc = szc;
                        } else {
                                crargs.szc = AS_MAP_HEAP;
                        }
                } else {
                        crargs.szc = AS_MAP_NO_LPOOB;
                }
                crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
                error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
                    &crargs);
                if (error) {
                        return (error);
                }

        } else if (nva < ova) {
                /*
                 * Release mapping to shrink UNIX data segment.
                 */
                (void) as_unmap(as, nva, (size_t)(ova - nva));
        }
        p->p_brksize = size;
        return (0);
}

/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t oldsize = p->p_stksize;
        size_t newsize;
        int err;

        /*
         * Serialize grow operations on an address space.
         * This also serves as the lock protecting p_stksize
         * and p_stkpageszc.
         */
        as_rangelock(as);
        if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                err = grow_lpg(sp);
        } else {
                err = grow_internal(sp, p->p_stkpageszc);
        }
        as_rangeunlock(as);

        if (err == 0 && (newsize = p->p_stksize) > oldsize) {
                ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
                ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
                /*
                 * Set up translations so the process doesn't have to fault in
                 * the stack pages we just gave it.
                 */
                (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
                    newsize - oldsize, F_INVAL, S_WRITE);
        }
        return ((err == 0 ? 1 : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
        struct proc *p = curproc;
        size_t pgsz;
        size_t len, newsize;
        caddr_t addr, saddr;
        caddr_t growend;
        int oszc, szc;
        int err;

        newsize = p->p_usrstack - sp;

        oszc = p->p_stkpageszc;
        pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         * This shouldn't happen as the stack never shrinks.
         */
        if (szc <= oszc) {
                err = grow_internal(sp, oszc);
                /* failed, fall back to base page size */
                if (err != 0 && oszc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * We've grown sufficiently to switch to a new page size.
         * So we are going to remap the whole segment with the new page size.
         */
        err = grow_internal(sp, szc);
        /* The grow with szc failed, so fall back to base page size. */
        if (err != 0) {
                if (szc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * Round up stack pointer to a large page boundary and remap
         * any pgsz pages in the segment already faulted in beyond that
         * point.
         */
        saddr = p->p_usrstack - p->p_stksize;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
        growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
        len = growend - addr;
        /* Check that len is not negative. Update page size code for stack. */
        if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_stkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
        struct proc *p = curproc;
        size_t newsize;
        size_t oldsize;
        int    error;
        size_t pgsz;
        uint_t szc;
        struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

        ASSERT(sp < p->p_usrstack);
        sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

        /*
         * grow to growszc alignment but use current p->p_stkpageszc for
         * the segvn_crargs szc passed to segvn_create. For memcntl to
         * increase the szc, this allows the new extension segment to be
         * concatenated successfully with the existing stack segment.
         */
        if ((szc = growszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
                newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
                if (newsize > (size_t)p->p_stk_ctl) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        newsize = p->p_usrstack - sp;
                }
        } else {
                pgsz = PAGESIZE;
                newsize = p->p_usrstack - sp;
        }

        if (newsize > (size_t)p->p_stk_ctl) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
                    RCA_UNSAFE_ALL);

                return (ENOMEM);
        }

        oldsize = p->p_stksize;
        ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

        if (newsize <= oldsize) {    /* prevent the stack from shrinking */
                return (0);
        }

        if (!(p->p_stkprot & PROT_EXEC)) {
                crargs.prot &= ~PROT_EXEC;
        }
        /*
         * extend stack with the proposed new growszc, which is different
         * than p_stkpageszc only on a memcntl to increase the stack pagesize.
         * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
         * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
         * if not aligned to szc's pgsz.
         */
        if (szc > 0) {
                caddr_t oldsp = p->p_usrstack - oldsize;
                caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
                    pgsz);

                if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
                        crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                            AS_MAP_NO_LPOOB;
                } else if (oldsp == austk) {
                        crargs.szc = szc;
                } else {
                        crargs.szc = AS_MAP_STACK;
                }
        } else {
                crargs.szc = AS_MAP_NO_LPOOB;
        }
        crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

        if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
            segvn_create, &crargs)) != 0) {
                if (error == EAGAIN) {
                        cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                            "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
                }
                return (error);
        }
        p->p_stksize = newsize;
        return (0);
}

/*
 * Find address for user to map.  If MAP_FIXED is not specified, we can pick
 * any address we want, but we will first try the value in *addrp if it is
 * non-NULL and _MAP_RANDOMIZE is not set.  Thus this is implementing a way to
 * try and get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
        caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
        size_t lenp = len;

        ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
        if (flags & MAP_FIXED) {
                (void) as_unmap(as, *addrp, len);
                return (0);
        } else if (basep != NULL &&
            ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
            !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
                /* User supplied address was available */
                *addrp = basep;
        } else {
                /*
                 * No user supplied address or the address supplied was not
                 * available.
                 */
                map_addr(addrp, len, off, vacalign, flags);
        }
        if (*addrp == NULL)
                return (ENOMEM);
        return (0);
}

/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
        struct segvn_crargs vn_a;
        int error;

        if (((PROT_ALL & uprot) != uprot))
                return (EACCES);

        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }
        /*
         * No need to worry about vac alignment for anonymous
         * pages since this is a "clone" object that doesn't
         * yet exist.
         */
        error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
        if (error != 0) {
                return (error);
        }

        /*
         * Use the seg_vn segment driver; passing in the NULL amp
         * gives the desired "cloning" effect.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = uprot;
        vn_a.maxprot = PROT_ALL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = CRED();
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        return (as_map(as, *addrp, len, segvn_create, &vn_a));
}

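/*
 * Common code for mmap(2) and its variants: validate the flags, protections
 * and offset, then hand anonymous requests to zmap() and file mappings to
 * the vnode's VOP_MAP() routine.
 */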
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
        struct vnode *vp;
        struct as *as = curproc->p_as;
        uint_t uprot, maxprot, type;
        int error;
        int in_crit = 0;

        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
            _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
            MAP_TEXT | MAP_INITDATA)) != 0) {
                /* | MAP_RENAME */      /* not implemented, let user know */
                return (EINVAL);
        }

        if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
                return (EINVAL);
        }

        if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
                return (EINVAL);
        }

        if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
            (MAP_FIXED | _MAP_RANDOMIZE)) {
                return (EINVAL);
        }

        /*
         * If it's not a fixed allocation and mmap ASLR is enabled, randomize
         * it.
         */
        if (((flags & MAP_FIXED) == 0) &&
            secflag_enabled(curproc, PROC_SEC_ASLR))
                flags |= _MAP_RANDOMIZE;

#if defined(__sparc)
        /*
         * See if this is an "old mmap call".  If so, remember this
         * fact and convert the flags value given to mmap to indicate
         * the specified address in the system call must be used.
         * _MAP_NEW is set by all new uses of mmap.
         */
        if ((flags & _MAP_NEW) == 0)
                flags |= MAP_FIXED;
#endif
        flags &= ~_MAP_NEW;

        type = flags & MAP_TYPE;
        if (type != MAP_PRIVATE && type != MAP_SHARED)
                return (EINVAL);

        if (flags & MAP_ALIGN) {
                if (flags & MAP_FIXED)
                        return (EINVAL);

                /* alignment needs to be a power of 2 >= page size */
                if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
                    !ISP2((uintptr_t)*addrp))
                        return (EINVAL);
        }
        /*
         * Check for bad lengths and file position.
         * We let the VOP_MAP routine check for negative lengths
         * since on some vnode types this might be appropriate.
         */
        if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
                return (EINVAL);

        maxprot = PROT_ALL;             /* start out allowing all accesses */
        uprot = prot | PROT_USER;

        if (fp == NULL) {
                ASSERT(flags & MAP_ANON);
                /* discard lwpchan mappings, like munmap() */
                if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                        lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
                as_rangelock(as);
                error = zmap(as, addrp, len, uprot, flags, pos);
                as_rangeunlock(as);
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (error == 0 && (flags & MAP_SHARED)) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                return (error);
        } else if ((flags & MAP_ANON) != 0)
                return (EINVAL);

        vp = fp->f_vnode;

        /* Can't execute code from "noexec" mounted filesystem. */
        if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
                maxprot &= ~PROT_EXEC;

        /*
         * These checks were added as part of large files.
         *
         * Return ENXIO if the initial position is negative; return EOVERFLOW
         * if (offset + len) would overflow the maximum allowed offset for the
         * type of file descriptor being used.
         */
        if (vp->v_type == VREG) {
                if (pos < 0)
                        return (ENXIO);
                if ((offset_t)len > (OFFSET_MAX(fp) - pos))
                        return (EOVERFLOW);
        }

        if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
                /* no write access allowed */
                maxprot &= ~PROT_WRITE;
        }

        /*
         * XXX - Do we also adjust maxprot based on protections
         * of the vnode?  E.g. if no execute permission is given
         * on the vnode for the current user, maxprot probably
         * should disallow PROT_EXEC also?  This is different
         * from the write access as this would be a per vnode
         * test as opposed to a per fd test for writability.
         */

        /*
         * Verify that the specified protections are not greater than
         * the maximum allowable protections.  Also test to make sure
         * that the file descriptor allows for read access, since
         * "write only" mappings are hard to do: normally we do
         * the read from the file before the page can be written.
         */
        if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
                return (EACCES);

        /*
         * If the user specified an address, do some simple checks here
         */
        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }

        if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
            nbl_need_check(vp)) {
                int svmand;
                nbl_op_t nop;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto done;
                if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
                        if (prot & (PROT_READ | PROT_EXEC)) {
                                nop = NBL_READWRITE;
                        } else {
                                nop = NBL_WRITE;
                        }
                } else {
                        nop = NBL_READ;
                }
                if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
                        error = EACCES;
                        goto done;
                }
        }

        /* discard lwpchan mappings, like munmap() */
        if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

        /*
         * Ok, now let the vnode map routine do its thing to set things up.
         */
        error = VOP_MAP(vp, pos, as,
            addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

        if (error == 0) {
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (flags & MAP_SHARED) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                if (vp->v_type == VREG &&
                    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
                        /*
                         * Mark this as an executable vnode
                         */
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VVMEXEC;
                        mutex_exit(&vp->v_lock);
                }
        }

done:
        if (in_crit)
                nbl_end_crit(vp);
        return (error);
}

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
        struct file *fp;
        int error;

        if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&addr, len, prot, flags,
                    NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&addr, len, prot, flags,
                    fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif  /* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
        struct file *fp;
        int error;
        caddr_t a = (caddr_t)(uintptr_t)addr;

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
        caddr_t addr;
        size_t len;
#ifdef _LP64
        /*
         * 32-bit contents, 64-bit cells
         */
        uint64_t prot;
        uint64_t flags;
        uint64_t fd;
        uint64_t offhi;
        uint64_t offlo;
#else
        /*
         * 32-bit contents, 32-bit cells
         */
        uint32_t prot;
        uint32_t flags;
        uint32_t fd;
        uint32_t offhi;
        uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
        struct file *fp;
        int error;
        caddr_t a = uap->addr;
        int flags = (int)uap->flags;
        int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
        offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
        offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, NULL, off);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, fp, off);
                releasef(fd);
        } else
                error = EBADF;

        if (error == 0)
                rvp->r_val1 = (uintptr_t)a;
        return (error);
}

#endif  /* _SYSCALL32_IMPL || _ILP32 */

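/*
 * munmap(2): unmap the pages in the range [addr, addr + len).
 */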
int
munmap(caddr_t addr, size_t len)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(EINVAL));

        /*
         * Discard lwpchan mappings.
         */
        if (p->p_lcp != NULL)
                lwpchan_delete_mapping(p, addr, addr + len);
        if (as_unmap(as, addr, len) != 0)
                return (set_errno(EINVAL));

        return (0);
}

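/*
 * mprotect(2): change the access protections of the mappings in
 * [addr, addr + len).
 */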
int
mprotect(caddr_t addr, size_t len, int prot)
{
        struct as *as = curproc->p_as;
        uint_t uprot = prot | PROT_USER;
        int error;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
        case RANGE_OKAY:
                break;
        case RANGE_BADPROT:
                return (set_errno(ENOTSUP));
        case RANGE_BADADDR:
        default:
                return (set_errno(ENOMEM));
        }

        error = as_setprot(as, addr, len, uprot);
        if (error)
                return (set_errno(error));
        return (0);
}

#define MC_CACHE        128                     /* internal result buffer */
#define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */

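/*
 * mincore(2): report which pages in the range [addr, addr + len) are
 * resident in memory, one byte of *vecp per page.
 */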
int
mincore(caddr_t addr, size_t len, char *vecp)
{
        struct as *as = curproc->p_as;
        caddr_t ea;                     /* end address of loop */
        size_t rl;                      /* inner result length */
        char vec[MC_CACHE];             /* local vector cache */
        int error;
        model_t model;
        long    llen;

        model = get_udatamodel();
        /*
         * Validate form of address parameters.
         */
        if (model == DATAMODEL_NATIVE) {
                llen = (long)len;
        } else {
                llen = (int32_t)(size32_t)len;
        }
        if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(ENOMEM));

        /*
         * Loop over subranges of interval [addr : addr + len), recovering
         * results internally and then copying them out to caller.  Subrange
         * is based on the size of MC_CACHE, defined above.
         */
        for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
                error = as_incore(as, addr,
                    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
                if (rl != 0) {
                        rl = (rl + PAGESIZE - 1) / PAGESIZE;
                        if (copyout(vec, vecp, rl) != 0)
                                return (set_errno(EFAULT));
                        vecp += rl;
                }
                if (error != 0)
                        return (set_errno(ENOMEM));
        }
        return (0);
}