1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2015 Garrett D'Amore <garrett@damore.org>
  24  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1988 AT&T     */
  28 /*        All Rights Reserved   */
  29 /*
  30  * Copyright 2014, Joyent, Inc.  All rights reserved.
  31  */
  32 
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/systm.h>
  37 #include <sys/signal.h>
  38 #include <sys/cred_impl.h>
  39 #include <sys/policy.h>
  40 #include <sys/user.h>
  41 #include <sys/errno.h>
  42 #include <sys/file.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vnode.h>
  45 #include <sys/mman.h>
  46 #include <sys/acct.h>
  47 #include <sys/cpuvar.h>
  48 #include <sys/proc.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/debug.h>
  51 #include <sys/pathname.h>
  52 #include <sys/vm.h>
  53 #include <sys/lgrp.h>
  54 #include <sys/vtrace.h>
  55 #include <sys/exec.h>
  56 #include <sys/exechdr.h>
  57 #include <sys/kmem.h>
  58 #include <sys/prsystm.h>
  59 #include <sys/modctl.h>
  60 #include <sys/vmparam.h>
  61 #include <sys/door.h>
  62 #include <sys/schedctl.h>
  63 #include <sys/utrap.h>
  64 #include <sys/systeminfo.h>
  65 #include <sys/stack.h>
  66 #include <sys/rctl.h>
  67 #include <sys/dtrace.h>
  68 #include <sys/lwpchan_impl.h>
  69 #include <sys/pool.h>
  70 #include <sys/sdt.h>
  71 #include <sys/brand.h>
  72 #include <sys/klpd.h>
  73 
  74 #include <c2/audit.h>
  75 
  76 #include <vm/hat.h>
  77 #include <vm/anon.h>
  78 #include <vm/as.h>
  79 #include <vm/seg.h>
  80 #include <vm/seg_vn.h>
  81 
  82 #define PRIV_RESET              0x01    /* needs to reset privs */
  83 #define PRIV_SETID              0x02    /* needs to change uids */
  84 #define PRIV_SETUGID            0x04    /* is setuid/setgid/forced privs */
  85 #define PRIV_INCREASE           0x08    /* child runs with more privs */
  86 #define MAC_FLAGS               0x10    /* need to adjust MAC flags */
  87 #define PRIV_FORCED             0x20    /* has forced privileges */
  88 
  89 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
  90     priv_set_t *, cred_t *, const char *);
  91 static int hold_execsw(struct execsw *);
  92 
  93 uint_t auxv_hwcap = 0;  /* auxv AT_SUN_HWCAP value; determined on the fly */
  94 uint_t auxv_hwcap_2 = 0;        /* AT_SUN_HWCAP2 */
  95 #if defined(_SYSCALL32_IMPL)
  96 uint_t auxv_hwcap32 = 0;        /* 32-bit version of auxv_hwcap */
  97 uint_t auxv_hwcap32_2 = 0;      /* 32-bit version of auxv_hwcap2 */
  98 #endif
  99 
 100 #define PSUIDFLAGS              (SNOCD|SUGID)
 101 
 102 #define DEVFD                   "/dev/fd/"
 103 
 104 /*
 105  * exece() - system call wrapper around exec_common()
 106  */
 107 int
 108 exece(const char *fname, const char **argp, const char **envp)
 109 {
 110         int error;
 111 
 112         error = exec_common(fname, argp, envp, EBA_NONE);
 113         return (error ? (set_errno(error)) : 0);
 114 }
 115 
 116 int
 117 exec_common(const char *fname, const char **argp, const char **envp,
 118     int brand_action)
 119 {
 120         vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
 121         proc_t *p = ttoproc(curthread);
 122         klwp_t *lwp = ttolwp(curthread);
 123         struct user *up = PTOU(p);
 124         long execsz;            /* temporary count of exec size */
 125         int i;
 126         int error;
 127         char exec_file[MAXCOMLEN+1];
 128         struct pathname pn;
 129         struct pathname resolvepn;
 130         struct uarg args;
 131         struct execa ua;
 132         k_sigset_t savedmask;
 133         lwpdir_t *lwpdir = NULL;
 134         tidhash_t *tidhash;
 135         lwpdir_t *old_lwpdir = NULL;
 136         uint_t old_lwpdir_sz;
 137         tidhash_t *old_tidhash;
 138         uint_t old_tidhash_sz;
 139         ret_tidhash_t *ret_tidhash;
 140         lwpent_t *lep;
 141         boolean_t brandme = B_FALSE;
 142 
 143         /*
 144          * exec() is not supported for the /proc agent lwp.
 145          */
 146         if (curthread == p->p_agenttp)
 147                 return (ENOTSUP);
 148 
 149         if (brand_action != EBA_NONE) {
 150                 /*
 151                  * Brand actions are not supported for processes that are not
 152                  * running in a branded zone.
 153                  */
 154                 if (!ZONE_IS_BRANDED(p->p_zone))
 155                         return (ENOTSUP);
 156 
 157                 if (brand_action == EBA_NATIVE) {
 158                         /* Only branded processes can be unbranded */
 159                         if (!PROC_IS_BRANDED(p))
 160                                 return (ENOTSUP);
 161                 } else {
 162                         /* Only unbranded processes can be branded */
 163                         if (PROC_IS_BRANDED(p))
 164                                 return (ENOTSUP);
 165                         brandme = B_TRUE;
 166                 }
 167         } else {
 168                 /*
 169                  * If this is a native zone, or if the process is already
 170                  * branded, then we don't need to do anything.  If this is
 171                  * a native process in a branded zone, we need to brand the
 172                  * process as it exec()s the new binary.
 173                  */
 174                 if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
 175                         brandme = B_TRUE;
 176         }
 177 
 178         /*
 179          * Inform /proc that an exec() has started.
 180          * Hold signals that are ignored by default so that we will
 181          * not be interrupted by a signal that will be ignored after
 182          * successful completion of gexec().
 183          */
 184         mutex_enter(&p->p_lock);
 185         prexecstart();
 186         schedctl_finish_sigblock(curthread);
 187         savedmask = curthread->t_hold;
 188         sigorset(&curthread->t_hold, &ignoredefault);
 189         mutex_exit(&p->p_lock);
 190 
 191         /*
 192          * Look up path name and remember last component for later.
 193          * To help coreadm expand its %d token, we attempt to save
 194          * the directory containing the executable in p_execdir. The
 195          * first call to lookuppn() may fail and return EINVAL because
 196          * dirvpp is non-NULL. In that case, we make a second call to
 197          * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
 198          * but coreadm is allowed to expand %d to the empty string and
 199          * there are other cases in which that failure may occur.
 200          */
 201         if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
 202                 goto out;
 203         pn_alloc(&resolvepn);
 204 
 205         if (strncmp(pn.pn_path, DEVFD, strlen(DEVFD)) == 0) {
 206                 /* looks like a /dev/fd node */
 207                 char *p = pn.pn_path + strlen(DEVFD);
 208                 int fd = stoi(&p);
 209                 if ((fd < 0) || (*p != 0) || (p == pn.pn_path)) {
 210                         error = EBADF;
 211                         goto out;
 212                 }
 213                 if ((error = fgetstartvp(fd, NULL, &vp)) != 0) {
 214                         goto out;       /* error will be EBADF */
 215                 }
 216                 (void) pn_set(&resolvepn, pn.pn_path);
 217 
 218         } else if ((error =
 219             lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
 220                 pn_free(&resolvepn);
 221                 pn_free(&pn);
 222                 if (error != EINVAL)
 223                         goto out;
 224 
 225                 dir = NULL;
 226                 if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
 227                         goto out;
 228                 pn_alloc(&resolvepn);
 229                 if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
 230                     &vp)) != 0) {
 231                         pn_free(&resolvepn);
 232                         pn_free(&pn);
 233                         goto out;
 234                 }
 235         }
 236         if (vp == NULL) {
 237                 if (dir != NULL)
 238                         VN_RELE(dir);
 239                 error = ENOENT;
 240                 pn_free(&resolvepn);
 241                 pn_free(&pn);
 242                 goto out;
 243         }
 244 
 245         if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
 246                 if (dir != NULL)
 247                         VN_RELE(dir);
 248                 pn_free(&resolvepn);
 249                 pn_free(&pn);
 250                 VN_RELE(vp);
 251                 goto out;
 252         }
 253 
 254         /*
 255          * We do not allow executing files in attribute directories.
 256          * We test this by determining whether the resolved path
 257          * contains a "/" when we're in an attribute directory;
 258          * only if the pathname does not contain a "/" the resolved path
 259          * points to a file in the current working (attribute) directory.
 260          */
 261         if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
 262             strchr(resolvepn.pn_path, '/') == NULL) {
 263                 if (dir != NULL)
 264                         VN_RELE(dir);
 265                 error = EACCES;
 266                 pn_free(&resolvepn);
 267                 pn_free(&pn);
 268                 VN_RELE(vp);
 269                 goto out;
 270         }
 271 
 272         bzero(exec_file, MAXCOMLEN+1);
 273         (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
 274         bzero(&args, sizeof (args));
 275         args.pathname = resolvepn.pn_path;
 276         /* don't free resolvepn until we are done with args */
 277         pn_free(&pn);
 278 
 279         /*
 280          * If we're running in a profile shell, then call pfexecd.
 281          */
 282         if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
 283                 error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
 284                     &args.scrubenv);
 285 
 286                 /* Returning errno in case we're not allowed to execute. */
 287                 if (error > 0) {
 288                         if (dir != NULL)
 289                                 VN_RELE(dir);
 290                         pn_free(&resolvepn);
 291                         VN_RELE(vp);
 292                         goto out;
 293                 }
 294 
 295                 /* Don't change the credentials when using old ptrace. */
 296                 if (args.pfcred != NULL &&
 297                     (p->p_proc_flag & P_PR_PTRACE) != 0) {
 298                         crfree(args.pfcred);
 299                         args.pfcred = NULL;
 300                         args.scrubenv = B_FALSE;
 301                 }
 302         }
 303 
 304         /*
 305          * Specific exec handlers, or policies determined via
 306          * /etc/system may override the historical default.
 307          */
 308         args.stk_prot = PROT_ZFOD;
 309         args.dat_prot = PROT_ZFOD;
 310 
 311         CPU_STATS_ADD_K(sys, sysexec, 1);
 312         DTRACE_PROC1(exec, char *, args.pathname);
 313 
 314         ua.fname = fname;
 315         ua.argp = argp;
 316         ua.envp = envp;
 317 
 318         /* If necessary, brand this process before we start the exec. */
 319         if (brandme)
 320                 brand_setbrand(p);
 321 
 322         if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
 323             exec_file, p->p_cred, brand_action)) != 0) {
 324                 if (brandme)
 325                         brand_clearbrand(p, B_FALSE);
 326                 VN_RELE(vp);
 327                 if (dir != NULL)
 328                         VN_RELE(dir);
 329                 pn_free(&resolvepn);
 330                 goto fail;
 331         }
 332 
 333         /*
 334          * Free floating point registers (sun4u only)
 335          */
 336         ASSERT(lwp != NULL);
 337         lwp_freeregs(lwp, 1);
 338 
 339         /*
 340          * Free thread and process context ops.
 341          */
 342         if (curthread->t_ctx)
 343                 freectx(curthread, 1);
 344         if (p->p_pctx)
 345                 freepctx(p, 1);
 346 
 347         /*
 348          * Remember file name for accounting; clear any cached DTrace predicate.
 349          */
 350         up->u_acflag &= ~AFORK;
 351         bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
 352         curthread->t_predcache = NULL;
 353 
 354         /*
 355          * Clear contract template state
 356          */
 357         lwp_ctmpl_clear(lwp);
 358 
 359         /*
 360          * Save the directory in which we found the executable for expanding
 361          * the %d token used in core file patterns.
 362          */
 363         mutex_enter(&p->p_lock);
 364         tmpvp = p->p_execdir;
 365         p->p_execdir = dir;
 366         if (p->p_execdir != NULL)
 367                 VN_HOLD(p->p_execdir);
 368         mutex_exit(&p->p_lock);
 369 
 370         if (tmpvp != NULL)
 371                 VN_RELE(tmpvp);
 372 
 373         /*
 374          * Reset stack state to the user stack, clear set of signals
 375          * caught on the signal stack, and reset list of signals that
 376          * restart system calls; the new program's environment should
 377          * not be affected by detritus from the old program.  Any
 378          * pending held signals remain held, so don't clear t_hold.
 379          */
 380         mutex_enter(&p->p_lock);
 381         lwp->lwp_oldcontext = 0;
 382         lwp->lwp_ustack = 0;
 383         lwp->lwp_old_stk_ctl = 0;
 384         sigemptyset(&up->u_signodefer);
 385         sigemptyset(&up->u_sigonstack);
 386         sigemptyset(&up->u_sigresethand);
 387         lwp->lwp_sigaltstack.ss_sp = 0;
 388         lwp->lwp_sigaltstack.ss_size = 0;
 389         lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
 390 
 391         /*
 392          * Make saved resource limit == current resource limit.
 393          */
 394         for (i = 0; i < RLIM_NLIMITS; i++) {
 395                 /*CONSTCOND*/
 396                 if (RLIM_SAVED(i)) {
 397                         (void) rctl_rlimit_get(rctlproc_legacy[i], p,
 398                             &up->u_saved_rlimit[i]);
 399                 }
 400         }
 401 
 402         /*
 403          * If the action was to catch the signal, then the action
 404          * must be reset to SIG_DFL.
 405          */
 406         sigdefault(p);
 407         p->p_flag &= ~(SNOWAIT|SJCTL);
 408         p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
 409         up->u_signal[SIGCLD - 1] = SIG_DFL;
 410 
 411         /*
 412          * Delete the dot4 sigqueues/signotifies.
 413          */
 414         sigqfree(p);
 415 
 416         mutex_exit(&p->p_lock);
 417 
 418         mutex_enter(&p->p_pflock);
 419         p->p_prof.pr_base = NULL;
 420         p->p_prof.pr_size = 0;
 421         p->p_prof.pr_off = 0;
 422         p->p_prof.pr_scale = 0;
 423         p->p_prof.pr_samples = 0;
 424         mutex_exit(&p->p_pflock);
 425 
 426         ASSERT(curthread->t_schedctl == NULL);
 427 
 428 #if defined(__sparc)
 429         if (p->p_utraps != NULL)
 430                 utrap_free(p);
 431 #endif  /* __sparc */
 432 
 433         /*
 434          * Close all close-on-exec files.
 435          */
 436         close_exec(P_FINFO(p));
 437         TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
 438 
 439         /* Unbrand ourself if necessary. */
 440         if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
 441                 brand_clearbrand(p, B_FALSE);
 442 
 443         setregs(&args);
 444 
 445         /* Mark this as an executable vnode */
 446         mutex_enter(&vp->v_lock);
 447         vp->v_flag |= VVMEXEC;
 448         mutex_exit(&vp->v_lock);
 449 
 450         VN_RELE(vp);
 451         if (dir != NULL)
 452                 VN_RELE(dir);
 453         pn_free(&resolvepn);
 454 
 455         /*
 456          * Allocate a new lwp directory and lwpid hash table if necessary.
 457          */
 458         if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
 459                 lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
 460                 lwpdir->ld_next = lwpdir + 1;
 461                 tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
 462                 if (p->p_lwpdir != NULL)
 463                         lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
 464                 else
 465                         lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
 466         }
 467 
 468         if (PROC_IS_BRANDED(p))
 469                 BROP(p)->b_exec();
 470 
 471         mutex_enter(&p->p_lock);
 472         prbarrier(p);
 473 
 474         /*
 475          * Reset lwp id to the default value of 1.
 476          * This is a single-threaded process now
 477          * and lwp #1 is lwp_wait()able by default.
 478          * The t_unpark flag should not be inherited.
 479          */
 480         ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
 481         curthread->t_tid = 1;
 482         kpreempt_disable();
 483         ASSERT(curthread->t_lpl != NULL);
 484         p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
 485         kpreempt_enable();
 486         if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
 487                 lgrp_update_trthr_migrations(1);
 488         }
 489         curthread->t_unpark = 0;
 490         curthread->t_proc_flag |= TP_TWAIT;
 491         curthread->t_proc_flag &= ~TP_DAEMON;    /* daemons shouldn't exec */
 492         p->p_lwpdaemon = 0;                  /* but oh well ... */
 493         p->p_lwpid = 1;
 494 
 495         /*
 496          * Install the newly-allocated lwp directory and lwpid hash table
 497          * and insert the current thread into the new hash table.
 498          */
 499         if (lwpdir != NULL) {
 500                 old_lwpdir = p->p_lwpdir;
 501                 old_lwpdir_sz = p->p_lwpdir_sz;
 502                 old_tidhash = p->p_tidhash;
 503                 old_tidhash_sz = p->p_tidhash_sz;
 504                 p->p_lwpdir = p->p_lwpfree = lwpdir;
 505                 p->p_lwpdir_sz = 2;
 506                 lep->le_thread = curthread;
 507                 lep->le_lwpid = curthread->t_tid;
 508                 lep->le_start = curthread->t_start;
 509                 lwp_hash_in(p, lep, tidhash, 2, 0);
 510                 p->p_tidhash = tidhash;
 511                 p->p_tidhash_sz = 2;
 512         }
 513         ret_tidhash = p->p_ret_tidhash;
 514         p->p_ret_tidhash = NULL;
 515 
 516         /*
 517          * Restore the saved signal mask and
 518          * inform /proc that the exec() has finished.
 519          */
 520         curthread->t_hold = savedmask;
 521         prexecend();
 522         mutex_exit(&p->p_lock);
 523         if (old_lwpdir) {
 524                 kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
 525                 kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
 526         }
 527         while (ret_tidhash != NULL) {
 528                 ret_tidhash_t *next = ret_tidhash->rth_next;
 529                 kmem_free(ret_tidhash->rth_tidhash,
 530                     ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
 531                 kmem_free(ret_tidhash, sizeof (*ret_tidhash));
 532                 ret_tidhash = next;
 533         }
 534 
 535         ASSERT(error == 0);
 536         DTRACE_PROC(exec__success);
 537         return (0);
 538 
 539 fail:
 540         DTRACE_PROC1(exec__failure, int, error);
 541 out:            /* error return */
 542         mutex_enter(&p->p_lock);
 543         curthread->t_hold = savedmask;
 544         prexecend();
 545         mutex_exit(&p->p_lock);
 546         ASSERT(error != 0);
 547         return (error);
 548 }
 549 
 550 
 551 /*
 552  * Perform generic exec duties and switchout to object-file specific
 553  * handler.
 554  */
 555 int
 556 gexec(
 557         struct vnode **vpp,
 558         struct execa *uap,
 559         struct uarg *args,
 560         struct intpdata *idatap,
 561         int level,
 562         long *execsz,
 563         caddr_t exec_file,
 564         struct cred *cred,
 565         int brand_action)
 566 {
 567         struct vnode *vp, *execvp = NULL;
 568         proc_t *pp = ttoproc(curthread);
 569         struct execsw *eswp;
 570         int error = 0;
 571         int suidflags = 0;
 572         ssize_t resid;
 573         uid_t uid, gid;
 574         struct vattr vattr;
 575         char magbuf[MAGIC_BYTES];
 576         int setid;
 577         cred_t *oldcred, *newcred = NULL;
 578         int privflags = 0;
 579         int setidfl;
 580         priv_set_t fset;
 581 
 582         /*
 583          * If the SNOCD or SUGID flag is set, turn it off and remember the
 584          * previous setting so we can restore it if we encounter an error.
 585          */
 586         if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
 587                 mutex_enter(&pp->p_lock);
 588                 suidflags = pp->p_flag & PSUIDFLAGS;
 589                 pp->p_flag &= ~PSUIDFLAGS;
 590                 mutex_exit(&pp->p_lock);
 591         }
 592 
 593         if ((error = execpermissions(*vpp, &vattr, args)) != 0)
 594                 goto bad_noclose;
 595 
 596         /* need to open vnode for stateful file systems */
 597         if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
 598                 goto bad_noclose;
 599         vp = *vpp;
 600 
 601         /*
 602          * Note: to support binary compatibility with SunOS a.out
 603          * executables, we read in the first four bytes, as the
 604          * magic number is in bytes 2-3.
 605          */
 606         if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
 607             (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
 608                 goto bad;
 609         if (resid != 0)
 610                 goto bad;
 611 
 612         if ((eswp = findexec_by_hdr(magbuf)) == NULL)
 613                 goto bad;
 614 
 615         if (level == 0 &&
 616             (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
 617             args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {
 618 
 619                 /* Pfcred is a credential with a ref count of 1 */
 620 
 621                 if (args->pfcred != NULL) {
 622                         privflags |= PRIV_INCREASE|PRIV_RESET;
 623                         newcred = cred = args->pfcred;
 624                 } else {
 625                         newcred = cred = crdup(cred);
 626                 }
 627 
 628                 /* If we can, drop the PA bit */
 629                 if ((privflags & PRIV_RESET) != 0)
 630                         priv_adjust_PA(cred);
 631 
 632                 if (privflags & PRIV_SETID) {
 633                         cred->cr_uid = uid;
 634                         cred->cr_gid = gid;
 635                         cred->cr_suid = uid;
 636                         cred->cr_sgid = gid;
 637                 }
 638 
 639                 if (privflags & MAC_FLAGS) {
 640                         if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
 641                                 CR_FLAGS(cred) &= ~NET_MAC_AWARE;
 642                         CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
 643                 }
 644 
 645                 /*
 646                  * Implement the privilege updates:
 647                  *
 648                  * Restrict with L:
 649                  *
 650                  *      I' = I & L
 651                  *
 652                  *      E' = P' = (I' + F) & A
 653                  *
 654                  * But if running under ptrace, we cap I and F with P.
 655                  */
 656                 if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
 657                         if ((privflags & PRIV_INCREASE) != 0 &&
 658                             (pp->p_proc_flag & P_PR_PTRACE) != 0) {
 659                                 priv_intersect(&CR_OPPRIV(cred),
 660                                     &CR_IPRIV(cred));
 661                                 priv_intersect(&CR_OPPRIV(cred), &fset);
 662                         }
 663                         priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 664                         CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 665                         if (privflags & PRIV_FORCED) {
 666                                 priv_set_PA(cred);
 667                                 priv_union(&fset, &CR_EPRIV(cred));
 668                                 priv_union(&fset, &CR_PPRIV(cred));
 669                         }
 670                         priv_adjust_PA(cred);
 671                 }
 672         } else if (level == 0 && args->pfcred != NULL) {
 673                 newcred = cred = args->pfcred;
 674                 privflags |= PRIV_INCREASE;
 675                 /* pfcred is not forced to adhere to these settings */
 676                 priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 677                 CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 678                 priv_adjust_PA(cred);
 679         }
 680 
 681         /* SunOS 4.x buy-back */
 682         if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
 683             (vattr.va_mode & (VSUID|VSGID))) {
 684                 char path[MAXNAMELEN];
 685                 refstr_t *mntpt = NULL;
 686                 int ret = -1;
 687 
 688                 bzero(path, sizeof (path));
 689                 zone_hold(pp->p_zone);
 690 
 691                 ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
 692                     sizeof (path), cred);
 693 
 694                 /* fallback to mountpoint if a path can't be found */
 695                 if ((ret != 0) || (ret == 0 && path[0] == '\0'))
 696                         mntpt = vfs_getmntpoint(vp->v_vfsp);
 697 
 698                 if (mntpt == NULL)
 699                         zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 700                             "!uid %d: setuid execution not allowed, "
 701                             "file=%s", cred->cr_uid, path);
 702                 else
 703                         zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 704                             "!uid %d: setuid execution not allowed, "
 705                             "fs=%s, file=%s", cred->cr_uid,
 706                             ZONE_PATH_TRANSLATE(refstr_value(mntpt),
 707                             pp->p_zone), exec_file);
 708 
 709                 if (!INGLOBALZONE(pp)) {
 710                         /* zone_rootpath always has trailing / */
 711                         if (mntpt == NULL)
 712                                 cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 713                                     "setuid execution not allowed, file=%s%s",
 714                                     pp->p_zone->zone_name, cred->cr_uid,
 715                                     pp->p_zone->zone_rootpath, path + 1);
 716                         else
 717                                 cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 718                                     "setuid execution not allowed, fs=%s, "
 719                                     "file=%s", pp->p_zone->zone_name,
 720                                     cred->cr_uid, refstr_value(mntpt),
 721                                     exec_file);
 722                 }
 723 
 724                 if (mntpt != NULL)
 725                         refstr_rele(mntpt);
 726 
 727                 zone_rele(pp->p_zone);
 728         }
 729 
 730         /*
 731          * execsetid() told us whether or not we had to change the
 732          * credentials of the process.  In privflags, it told us
 733          * whether we gained any privileges or executed a set-uid executable.
 734          */
 735         setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));
 736 
 737         /*
 738          * Use /etc/system variable to determine if the stack
 739          * should be marked as executable by default.
 740          */
 741         if (noexec_user_stack)
 742                 args->stk_prot &= ~PROT_EXEC;
 743 
 744         args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
 745         args->ex_vp = vp;
 746 
 747         /*
 748          * Traditionally, the setid flags told the sub processes whether
 749          * the file just executed was set-uid or set-gid; this caused
 750          * some confusion as the 'setid' flag did not match the SUGID
 751          * process flag which is only set when the uids/gids do not match.
 752          * A script set-gid/set-uid to the real uid/gid would start with
 753          * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
 754          * Now we flag those cases where the calling process cannot
 755          * be trusted to influence the newly exec'ed process, either
 756          * because it runs with more privileges or when the uids/gids
 757          * do in fact not match.
 758          * This also makes the runtime linker agree with the on exec
 759          * values of SNOCD and SUGID.
 760          */
 761         setidfl = 0;
 762         if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
 763             !supgroupmember(cred->cr_gid, cred))) {
 764                 setidfl |= EXECSETID_UGIDS;
 765         }
 766         if (setid & PRIV_SETUGID)
 767                 setidfl |= EXECSETID_SETID;
 768         if (setid & PRIV_FORCED)
 769                 setidfl |= EXECSETID_PRIVS;
 770 
 771         execvp = pp->p_exec;
 772         if (execvp)
 773                 VN_HOLD(execvp);
 774 
 775         error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
 776             setidfl, exec_file, cred, brand_action);
 777         rw_exit(eswp->exec_lock);
 778         if (error != 0) {
 779                 if (execvp)
 780                         VN_RELE(execvp);
 781                 /*
 782                  * If this process's p_exec has been set to the vp of
 783                  * the executable by exec_func, we will return without
 784                  * calling VOP_CLOSE because proc_exit will close it
 785                  * on exit.
 786                  */
 787                 if (pp->p_exec == vp)
 788                         goto bad_noclose;
 789                 else
 790                         goto bad;
 791         }
 792 
 793         if (level == 0) {
 794                 uid_t oruid;
 795 
 796                 if (execvp != NULL) {
 797                         /*
 798                          * Close the previous executable only if we are
 799                          * at level 0.
 800                          */
 801                         (void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
 802                             cred, NULL);
 803                 }
 804 
 805                 mutex_enter(&pp->p_crlock);
 806 
 807                 oruid = pp->p_cred->cr_ruid;
 808 
 809                 if (newcred != NULL) {
 810                         /*
 811                          * Free the old credentials, and set the new ones.
 812                          * Do this for both the process and the (single) thread.
 813                          */
 814                         crfree(pp->p_cred);
 815                         pp->p_cred = cred;   /* cred already held for proc */
 816                         crhold(cred);           /* hold new cred for thread */
 817                         /*
 818                          * DTrace accesses t_cred in probe context.  t_cred
 819                          * must always be either NULL, or point to a valid,
 820                          * allocated cred structure.
 821                          */
 822                         oldcred = curthread->t_cred;
 823                         curthread->t_cred = cred;
 824                         crfree(oldcred);
 825 
 826                         if (priv_basic_test >= 0 &&
 827                             !PRIV_ISASSERT(&CR_IPRIV(newcred),
 828                             priv_basic_test)) {
 829                                 pid_t pid = pp->p_pid;
 830                                 char *fn = PTOU(pp)->u_comm;
 831 
 832                                 cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
 833                                     "privilege removed from E/I", fn, pid);
 834                         }
 835                 }
 836                 /*
 837                  * On emerging from a successful exec(), the saved
 838                  * uid and gid equal the effective uid and gid.
 839                  */
 840                 cred->cr_suid = cred->cr_uid;
 841                 cred->cr_sgid = cred->cr_gid;
 842 
 843                 /*
 844                  * If the real and effective ids do not match, this
 845                  * is a setuid process that should not dump core.
 846                  * The group comparison is tricky; we prevent the code
 847                  * from flagging SNOCD when executing with an effective gid
 848                  * which is a supplementary group.
 849                  */
 850                 if (cred->cr_ruid != cred->cr_uid ||
 851                     (cred->cr_rgid != cred->cr_gid &&
 852                     !supgroupmember(cred->cr_gid, cred)) ||
 853                     (privflags & PRIV_INCREASE) != 0)
 854                         suidflags = PSUIDFLAGS;
 855                 else
 856                         suidflags = 0;
 857 
 858                 mutex_exit(&pp->p_crlock);
 859                 if (newcred != NULL && oruid != newcred->cr_ruid) {
 860                         /* Note that the process remains in the same zone. */
 861                         mutex_enter(&pidlock);
 862                         upcount_dec(oruid, crgetzoneid(newcred));
 863                         upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
 864                         mutex_exit(&pidlock);
 865                 }
 866                 if (suidflags) {
 867                         mutex_enter(&pp->p_lock);
 868                         pp->p_flag |= suidflags;
 869                         mutex_exit(&pp->p_lock);
 870                 }
 871                 if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
 872                         /*
 873                          * If process is traced via /proc, arrange to
 874                          * invalidate the associated /proc vnode.
 875                          */
 876                         if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
 877                                 args->traceinval = 1;
 878                 }
 879                 if (pp->p_proc_flag & P_PR_PTRACE)
 880                         psignal(pp, SIGTRAP);
 881                 if (args->traceinval)
 882                         prinvalidate(&pp->p_user);
 883         }
 884         if (execvp)
 885                 VN_RELE(execvp);
 886         return (0);
 887 
 888 bad:
 889         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);
 890 
 891 bad_noclose:
 892         if (newcred != NULL)
 893                 crfree(newcred);
 894         if (error == 0)
 895                 error = ENOEXEC;
 896 
 897         if (suidflags) {
 898                 mutex_enter(&pp->p_lock);
 899                 pp->p_flag |= suidflags;
 900                 mutex_exit(&pp->p_lock);
 901         }
 902         return (error);
 903 }
 904 
 905 extern char *execswnames[];
 906 
 907 struct execsw *
 908 allocate_execsw(char *name, char *magic, size_t magic_size)
 909 {
 910         int i, j;
 911         char *ename;
 912         char *magicp;
 913 
 914         mutex_enter(&execsw_lock);
 915         for (i = 0; i < nexectype; i++) {
 916                 if (execswnames[i] == NULL) {
 917                         ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 918                         (void) strcpy(ename, name);
 919                         execswnames[i] = ename;
 920                         /*
 921                          * Set the magic number last so that we
 922                          * don't need to hold the execsw_lock in
 923                          * findexectype().
 924                          */
 925                         magicp = kmem_alloc(magic_size, KM_SLEEP);
 926                         for (j = 0; j < magic_size; j++)
 927                                 magicp[j] = magic[j];
 928                         execsw[i].exec_magic = magicp;
 929                         mutex_exit(&execsw_lock);
 930                         return (&execsw[i]);
 931                 }
 932         }
 933         mutex_exit(&execsw_lock);
 934         return (NULL);
 935 }
 936 
 937 /*
 938  * Find the exec switch table entry with the corresponding magic string.
 939  */
 940 struct execsw *
 941 findexecsw(char *magic)
 942 {
 943         struct execsw *eswp;
 944 
 945         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 946                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 947                 if (magic && eswp->exec_maglen != 0 &&
 948                     bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
 949                         return (eswp);
 950         }
 951         return (NULL);
 952 }
 953 
 954 /*
 955  * Find the execsw[] index for the given exec header string by looking for the
 956  * magic string at a specified offset and length for each kind of executable
 957  * file format until one matches.  If no execsw[] entry is found, try to
 958  * autoload a module for this magic string.
 959  */
 960 struct execsw *
 961 findexec_by_hdr(char *header)
 962 {
 963         struct execsw *eswp;
 964 
 965         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 966                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 967                 if (header && eswp->exec_maglen != 0 &&
 968                     bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
 969                     eswp->exec_maglen) == 0) {
 970                         if (hold_execsw(eswp) != 0)
 971                                 return (NULL);
 972                         return (eswp);
 973                 }
 974         }
 975         return (NULL);  /* couldn't find the type */
 976 }
 977 
 978 /*
 979  * Find the execsw[] index for the given magic string.  If no execsw[] entry
 980  * is found, try to autoload a module for this magic string.
 981  */
 982 struct execsw *
 983 findexec_by_magic(char *magic)
 984 {
 985         struct execsw *eswp;
 986 
 987         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 988                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 989                 if (magic && eswp->exec_maglen != 0 &&
 990                     bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
 991                         if (hold_execsw(eswp) != 0)
 992                                 return (NULL);
 993                         return (eswp);
 994                 }
 995         }
 996         return (NULL);  /* couldn't find the type */
 997 }
 998 
 999 static int
1000 hold_execsw(struct execsw *eswp)
1001 {
1002         char *name;
1003 
1004         rw_enter(eswp->exec_lock, RW_READER);
1005         while (!LOADED_EXEC(eswp)) {
1006                 rw_exit(eswp->exec_lock);
1007                 name = execswnames[eswp-execsw];
1008                 ASSERT(name);
1009                 if (modload("exec", name) == -1)
1010                         return (-1);
1011                 rw_enter(eswp->exec_lock, RW_READER);
1012         }
1013         return (0);
1014 }
1015 
1016 static int
1017 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
1018     priv_set_t *fset, cred_t *cr, const char *pathname)
1019 {
1020         proc_t *pp = ttoproc(curthread);
1021         uid_t uid, gid;
1022         int privflags = 0;
1023 
1024         /*
1025          * Remember credentials.
1026          */
1027         uid = cr->cr_uid;
1028         gid = cr->cr_gid;
1029 
1030         /* Will try to reset the PRIV_AWARE bit later. */
1031         if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
1032                 privflags |= PRIV_RESET;
1033 
1034         if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
1035                 /*
1036                  * If it's a set-uid root program we perform the
1037                  * forced privilege look-aside. This has three possible
1038                  * outcomes:
1039                  *      no look aside information -> treat as before
1040                  *      look aside in Limit set -> apply forced privs
1041                  *      look aside not in Limit set -> ignore set-uid root
1042                  *
1043                  * Ordinary set-uid root execution only allowed if the limit
1044                  * set holds all unsafe privileges.
1045                  */
1046                 if (vattrp->va_mode & VSUID) {
1047                         if (vattrp->va_uid == 0) {
1048                                 int res = get_forced_privs(cr, pathname, fset);
1049 
1050                                 switch (res) {
1051                                 case -1:
1052                                         if (priv_issubset(&priv_unsafe,
1053                                             &CR_LPRIV(cr))) {
1054                                                 uid = vattrp->va_uid;
1055                                                 privflags |= PRIV_SETUGID;
1056                                         }
1057                                         break;
1058                                 case 0:
1059                                         privflags |= PRIV_FORCED|PRIV_INCREASE;
1060                                         break;
1061                                 default:
1062                                         break;
1063                                 }
1064                         } else {
1065                                 uid = vattrp->va_uid;
1066                                 privflags |= PRIV_SETUGID;
1067                         }
1068                 }
1069                 if (vattrp->va_mode & VSGID) {
1070                         gid = vattrp->va_gid;
1071                         privflags |= PRIV_SETUGID;
1072                 }
1073         }
1074 
1075         /*
1076          * Do we need to change our credential anyway?
1077          * This is the case when E != I or P != I, as
1078          * we need to do the assignments (with F empty and A full)
1079          * Or when I is not a subset of L; in that case we need to
1080          * enforce L.
1081          *
1082          *              I' = L & I
1083          *
1084          *              E' = P' = (I' + F) & A
1085          * or
1086          *              E' = P' = I'
1087          */
1088         if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
1089             !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
1090             !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
1091                 privflags |= PRIV_RESET;
1092 
1093         /* Child has more privileges than parent */
1094         if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
1095                 privflags |= PRIV_INCREASE;
1096 
1097         /* If MAC-aware flag(s) are on, need to update cred to remove. */
1098         if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
1099             (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
1100                 privflags |= MAC_FLAGS;
1101         /*
1102          * Set setuid/setgid protections if no ptrace() compatibility.
1103          * For privileged processes, honor setuid/setgid even in
1104          * the presence of ptrace() compatibility.
1105          */
1106         if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
1107             PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
1108             (cr->cr_uid != uid ||
1109             cr->cr_gid != gid ||
1110             cr->cr_suid != uid ||
1111             cr->cr_sgid != gid)) {
1112                 *uidp = uid;
1113                 *gidp = gid;
1114                 privflags |= PRIV_SETID;
1115         }
1116         return (privflags);
1117 }
1118 
1119 int
1120 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
1121 {
1122         int error;
1123         proc_t *p = ttoproc(curthread);
1124 
1125         vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
1126         if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
1127                 return (error);
1128         /*
1129          * Check the access mode.
1130          * If VPROC, ask /proc if the file is an object file.
1131          */
1132         if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
1133             !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
1134             (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
1135             (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
1136                 if (error == 0)
1137                         error = EACCES;
1138                 return (error);
1139         }
1140 
1141         if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
1142             (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
1143                 /*
1144                  * If process is under ptrace(2) compatibility,
1145                  * fail the exec(2).
1146                  */
1147                 if (p->p_proc_flag & P_PR_PTRACE)
1148                         goto bad;
1149                 /*
1150                  * Process is traced via /proc.
1151                  * Arrange to invalidate the /proc vnode.
1152                  */
1153                 args->traceinval = 1;
1154         }
1155         return (0);
1156 bad:
1157         if (error == 0)
1158                 error = ENOEXEC;
1159         return (error);
1160 }
1161 
1162 /*
1163  * Map a section of an executable file into the user's
1164  * address space.
1165  */
1166 int
1167 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1168     off_t offset, int prot, int page, uint_t szc)
1169 {
1170         int error = 0;
1171         off_t oldoffset;
1172         caddr_t zfodbase, oldaddr;
1173         size_t end, oldlen;
1174         size_t zfoddiff;
1175         label_t ljb;
1176         proc_t *p = ttoproc(curthread);
1177 
1178         oldaddr = addr;
1179         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1180         if (len) {
1181                 oldlen = len;
1182                 len += ((size_t)oldaddr - (size_t)addr);
1183                 oldoffset = offset;
1184                 offset = (off_t)((uintptr_t)offset & PAGEMASK);
1185                 if (page) {
1186                         spgcnt_t  prefltmem, availm, npages;
1187                         int preread;
1188                         uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1189 
1190                         if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1191                                 mflag |= MAP_TEXT;
1192                         } else {
1193                                 mflag |= MAP_INITDATA;
1194                         }
1195 
1196                         if (valid_usr_range(addr, len, prot, p->p_as,
1197                             p->p_as->a_userlimit) != RANGE_OKAY) {
1198                                 error = ENOMEM;
1199                                 goto bad;
1200                         }
1201                         if (error = VOP_MAP(vp, (offset_t)offset,
1202                             p->p_as, &addr, len, prot, PROT_ALL,
1203                             mflag, CRED(), NULL))
1204                                 goto bad;
1205 
1206                         /*
1207                          * If the segment can fit, then we prefault
1208                          * the entire segment in.  This is based on the
1209                          * model that says the best working set of a
1210                          * small program is all of its pages.
1211                          */
1212                         npages = (spgcnt_t)btopr(len);
1213                         prefltmem = freemem - desfree;
1214                         preread =
1215                             (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1216 
1217                         /*
1218                          * If we aren't prefaulting the segment,
1219                          * increment "deficit", if necessary to ensure
1220                          * that pages will become available when this
1221                          * process starts executing.
1222                          */
1223                         availm = freemem - lotsfree;
1224                         if (preread == 0 && npages > availm &&
1225                             deficit < lotsfree) {
1226                                 deficit += MIN((pgcnt_t)(npages - availm),
1227                                     lotsfree - deficit);
1228                         }
1229 
1230                         if (preread) {
1231                                 TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1232                                     "execmap preread:freemem %d size %lu",
1233                                     freemem, len);
1234                                 (void) as_fault(p->p_as->a_hat, p->p_as,
1235                                     (caddr_t)addr, len, F_INVAL, S_READ);
1236                         }
1237                 } else {
1238                         if (valid_usr_range(addr, len, prot, p->p_as,
1239                             p->p_as->a_userlimit) != RANGE_OKAY) {
1240                                 error = ENOMEM;
1241                                 goto bad;
1242                         }
1243 
1244                         if (error = as_map(p->p_as, addr, len,
1245                             segvn_create, zfod_argsp))
1246                                 goto bad;
1247                         /*
1248                          * Read in the segment in one big chunk.
1249                          */
1250                         if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1251                             oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1252                             (rlim64_t)0, CRED(), (ssize_t *)0))
1253                                 goto bad;
1254                         /*
1255                          * Now set protections.
1256                          */
1257                         if (prot != PROT_ZFOD) {
1258                                 (void) as_setprot(p->p_as, (caddr_t)addr,
1259                                     len, prot);
1260                         }
1261                 }
1262         }
1263 
1264         if (zfodlen) {
1265                 struct as *as = curproc->p_as;
1266                 struct seg *seg;
1267                 uint_t zprot = 0;
1268 
1269                 end = (size_t)addr + len;
1270                 zfodbase = (caddr_t)roundup(end, PAGESIZE);
1271                 zfoddiff = (uintptr_t)zfodbase - end;
1272                 if (zfoddiff) {
1273                         /*
1274                          * Before we go to zero the remaining space on the last
1275                          * page, make sure we have write permission.
1276                          *
1277                          * Normal illumos binaries don't even hit the case
1278                          * where we have to change permission on the last page
1279                          * since their protection is typically either
1280                          *    PROT_USER | PROT_WRITE | PROT_READ
1281                          * or
1282                          *    PROT_ZFOD (same as PROT_ALL).
1283                          *
1284                          * We need to be careful how we zero-fill the last page
1285                          * if the segment protection does not include
1286                          * PROT_WRITE. Using as_setprot() can cause the VM
1287                          * segment code to call segvn_vpage(), which must
1288                          * allocate a page struct for each page in the segment.
1289                          * If we have a very large segment, this may fail, so
1290                          * we have to check for that, even though we ignore
1291                          * other return values from as_setprot.
1292                          */
1293 
1294                         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1295                         seg = as_segat(curproc->p_as, (caddr_t)end);
1296                         if (seg != NULL)
1297                                 SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1298                                     &zprot);
1299                         AS_LOCK_EXIT(as, &as->a_lock);
1300 
1301                         if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1302                                 if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
1303                                     zprot | PROT_WRITE) == ENOMEM) {
1304                                         error = ENOMEM;
1305                                         goto bad;
1306                                 }
1307                         }
1308 
1309                         if (on_fault(&ljb)) {
1310                                 no_fault();
1311                                 if (seg != NULL && (zprot & PROT_WRITE) == 0)
1312                                         (void) as_setprot(as, (caddr_t)end,
1313                                             zfoddiff - 1, zprot);
1314                                 error = EFAULT;
1315                                 goto bad;
1316                         }
1317                         uzero((void *)end, zfoddiff);
1318                         no_fault();
1319                         if (seg != NULL && (zprot & PROT_WRITE) == 0)
1320                                 (void) as_setprot(as, (caddr_t)end,
1321                                     zfoddiff - 1, zprot);
1322                 }
1323                 if (zfodlen > zfoddiff) {
1324                         struct segvn_crargs crargs =
1325                             SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1326 
1327                         zfodlen -= zfoddiff;
1328                         if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1329                             p->p_as->a_userlimit) != RANGE_OKAY) {
1330                                 error = ENOMEM;
1331                                 goto bad;
1332                         }
1333                         if (szc > 0) {
1334                                 /*
1335                                  * ASSERT alignment because the mapelfexec()
1336                                  * caller for the szc > 0 case extended zfod
1337                                  * so it's end is pgsz aligned.
1338                                  */
1339                                 size_t pgsz = page_get_pagesize(szc);
1340                                 ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1341 
1342                                 if (IS_P2ALIGNED(zfodbase, pgsz)) {
1343                                         crargs.szc = szc;
1344                                 } else {
1345                                         crargs.szc = AS_MAP_HEAP;
1346                                 }
1347                         } else {
1348                                 crargs.szc = AS_MAP_NO_LPOOB;
1349                         }
1350                         if (error = as_map(p->p_as, (caddr_t)zfodbase,
1351                             zfodlen, segvn_create, &crargs))
1352                                 goto bad;
1353                         if (prot != PROT_ZFOD) {
1354                                 (void) as_setprot(p->p_as, (caddr_t)zfodbase,
1355                                     zfodlen, prot);
1356                         }
1357                 }
1358         }
1359         return (0);
1360 bad:
1361         return (error);
1362 }
1363 
1364 void
1365 setexecenv(struct execenv *ep)
1366 {
1367         proc_t *p = ttoproc(curthread);
1368         klwp_t *lwp = ttolwp(curthread);
1369         struct vnode *vp;
1370 
1371         p->p_bssbase = ep->ex_bssbase;
1372         p->p_brkbase = ep->ex_brkbase;
1373         p->p_brksize = ep->ex_brksize;
1374         if (p->p_exec)
1375                 VN_RELE(p->p_exec);  /* out with the old */
1376         vp = p->p_exec = ep->ex_vp;
1377         if (vp != NULL)
1378                 VN_HOLD(vp);            /* in with the new */
1379 
1380         lwp->lwp_sigaltstack.ss_sp = 0;
1381         lwp->lwp_sigaltstack.ss_size = 0;
1382         lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1383 }
1384 
1385 int
1386 execopen(struct vnode **vpp, int *fdp)
1387 {
1388         struct vnode *vp = *vpp;
1389         file_t *fp;
1390         int error = 0;
1391         int filemode = FREAD;
1392 
1393         VN_HOLD(vp);            /* open reference */
1394         if (error = falloc(NULL, filemode, &fp, fdp)) {
1395                 VN_RELE(vp);
1396                 *fdp = -1;      /* just in case falloc changed value */
1397                 return (error);
1398         }
1399         if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1400                 VN_RELE(vp);
1401                 setf(*fdp, NULL);
1402                 unfalloc(fp);
1403                 *fdp = -1;
1404                 return (error);
1405         }
1406         *vpp = vp;              /* vnode should not have changed */
1407         fp->f_vnode = vp;
1408         mutex_exit(&fp->f_tlock);
1409         setf(*fdp, fp);
1410         return (0);
1411 }
1412 
/*
 * Close descriptor `fd' through closeandsetf(), passing NULL as the
 * replacement file pointer, and return its result.
 */
int
execclose(int fd)
{
        int error;

        error = closeandsetf(fd, NULL);
        return (error);
}
1418 
1419 
1420 /*
1421  * noexec stub function.
1422  */
1423 /*ARGSUSED*/
1424 int
1425 noexec(
1426     struct vnode *vp,
1427     struct execa *uap,
1428     struct uarg *args,
1429     struct intpdata *idatap,
1430     int level,
1431     long *execsz,
1432     int setid,
1433     caddr_t exec_file,
1434     struct cred *cred)
1435 {
1436         cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1437         return (ENOEXEC);
1438 }
1439 
1440 /*
1441  * Support routines for building a user stack.
1442  *
1443  * execve(path, argv, envp) must construct a new stack with the specified
1444  * arguments and environment variables (see exec_args() for a description
1445  * of the user stack layout).  To do this, we copy the arguments and
1446  * environment variables from the old user address space into the kernel,
1447  * free the old as, create the new as, and copy our buffered information
1448  * to the new stack.  Our kernel buffer has the following structure:
1449  *
1450  *      +-----------------------+ <--- stk_base + stk_size
1451  *      | string offsets        |
1452  *      +-----------------------+ <--- stk_offp
1453  *      |                       |
1454  *      | STK_AVAIL() space     |
1455  *      |                       |
1456  *      +-----------------------+ <--- stk_strp
1457  *      | strings               |
1458  *      +-----------------------+ <--- stk_base
1459  *
1460  * When we add a string, we store the string's contents (including the null
1461  * terminator) at stk_strp, and we store the offset of the string relative to
 * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1463  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1464  * the difference between these pointers.  If we run out of space, we return
1465  * an error and exec_args() starts all over again with a buffer twice as large.
1466  * When we're all done, the kernel buffer looks like this:
1467  *
1468  *      +-----------------------+ <--- stk_base + stk_size
1469  *      | argv[0] offset        |
1470  *      +-----------------------+
1471  *      | ...                   |
1472  *      +-----------------------+
1473  *      | argv[argc-1] offset   |
1474  *      +-----------------------+
1475  *      | envp[0] offset        |
1476  *      +-----------------------+
1477  *      | ...                   |
1478  *      +-----------------------+
1479  *      | envp[envc-1] offset   |
1480  *      +-----------------------+
1481  *      | AT_SUN_PLATFORM offset|
1482  *      +-----------------------+
1483  *      | AT_SUN_EXECNAME offset|
1484  *      +-----------------------+ <--- stk_offp
1485  *      |                       |
1486  *      | STK_AVAIL() space     |
1487  *      |                       |
1488  *      +-----------------------+ <--- stk_strp
 *      | AT_SUN_EXECNAME string|
 *      +-----------------------+
 *      | AT_SUN_PLATFORM string|
1492  *      +-----------------------+
1493  *      | envp[envc-1] string   |
1494  *      +-----------------------+
1495  *      | ...                   |
1496  *      +-----------------------+
1497  *      | envp[0] string        |
1498  *      +-----------------------+
1499  *      | argv[argc-1] string   |
1500  *      +-----------------------+
1501  *      | ...                   |
1502  *      +-----------------------+
1503  *      | argv[0] string        |
1504  *      +-----------------------+ <--- stk_base
1505  */
1506 
/*
 * Bytes remaining in the kernel argument buffer: the distance from the end
 * of the stored strings (stk_strp) up to the lowest stored string offset
 * (stk_offp).
 */
#define STK_AVAIL(args)         ((char *)(args)->stk_offp - (args)->stk_strp)
1508 
1509 /*
1510  * Add a string to the stack.
1511  */
1512 static int
1513 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1514 {
1515         int error;
1516         size_t len;
1517 
1518         if (STK_AVAIL(args) < sizeof (int))
1519                 return (E2BIG);
1520         *--args->stk_offp = args->stk_strp - args->stk_base;
1521 
1522         if (segflg == UIO_USERSPACE) {
1523                 error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1524                 if (error != 0)
1525                         return (error);
1526         } else {
1527                 len = strlen(sp) + 1;
1528                 if (len > STK_AVAIL(args))
1529                         return (E2BIG);
1530                 bcopy(sp, args->stk_strp, len);
1531         }
1532 
1533         args->stk_strp += len;
1534 
1535         return (0);
1536 }
1537 
1538 static int
1539 stk_getptr(uarg_t *args, char *src, char **dst)
1540 {
1541         int error;
1542 
1543         if (args->from_model == DATAMODEL_NATIVE) {
1544                 ulong_t ptr;
1545                 error = fulword(src, &ptr);
1546                 *dst = (caddr_t)ptr;
1547         } else {
1548                 uint32_t ptr;
1549                 error = fuword32(src, &ptr);
1550                 *dst = (caddr_t)(uintptr_t)ptr;
1551         }
1552         return (error);
1553 }
1554 
1555 static int
1556 stk_putptr(uarg_t *args, char *addr, char *value)
1557 {
1558         if (args->to_model == DATAMODEL_NATIVE)
1559                 return (sulword(addr, (ulong_t)value));
1560         else
1561                 return (suword32(addr, (uint32_t)(uintptr_t)value));
1562 }
1563 
1564 static int
1565 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1566 {
1567         char *sp;
1568         int argc, error;
1569         int argv_empty = 0;
1570         size_t ptrsize = args->from_ptrsize;
1571         size_t size, pad;
1572         char *argv = (char *)uap->argp;
1573         char *envp = (char *)uap->envp;
1574 
1575         /*
1576          * Copy interpreter's name and argument to argv[0] and argv[1].
1577          */
1578         if (intp != NULL && intp->intp_name != NULL) {
1579                 if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1580                         return (error);
1581                 if (intp->intp_arg != NULL &&
1582                     (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1583                         return (error);
1584                 if (args->fname != NULL)
1585                         error = stk_add(args, args->fname, UIO_SYSSPACE);
1586                 else
1587                         error = stk_add(args, uap->fname, UIO_USERSPACE);
1588                 if (error)
1589                         return (error);
1590 
1591                 /*
1592                  * Check for an empty argv[].
1593                  */
1594                 if (stk_getptr(args, argv, &sp))
1595                         return (EFAULT);
1596                 if (sp == NULL)
1597                         argv_empty = 1;
1598 
1599                 argv += ptrsize;                /* ignore original argv[0] */
1600         }
1601 
1602         if (argv_empty == 0) {
1603                 /*
1604                  * Add argv[] strings to the stack.
1605                  */
1606                 for (;;) {
1607                         if (stk_getptr(args, argv, &sp))
1608                                 return (EFAULT);
1609                         if (sp == NULL)
1610                                 break;
1611                         if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1612                                 return (error);
1613                         argv += ptrsize;
1614                 }
1615         }
1616         argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1617         args->arglen = args->stk_strp - args->stk_base;
1618 
1619         /*
1620          * Add environ[] strings to the stack.
1621          */
1622         if (envp != NULL) {
1623                 for (;;) {
1624                         char *tmp = args->stk_strp;
1625                         if (stk_getptr(args, envp, &sp))
1626                                 return (EFAULT);
1627                         if (sp == NULL)
1628                                 break;
1629                         if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1630                                 return (error);
1631                         if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
1632                                 /* Undo the copied string */
1633                                 args->stk_strp = tmp;
1634                                 *(args->stk_offp++) = NULL;
1635                         }
1636                         envp += ptrsize;
1637                 }
1638         }
1639         args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1640         args->ne = args->na - argc;
1641 
1642         /*
1643          * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1644          * AT_SUN_EMULATOR strings to the stack.
1645          */
1646         if (auxvpp != NULL && *auxvpp != NULL) {
1647                 if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1648                         return (error);
1649                 if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1650                         return (error);
1651                 if (args->brandname != NULL &&
1652                     (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1653                         return (error);
1654                 if (args->emulator != NULL &&
1655                     (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1656                         return (error);
1657         }
1658 
1659         /*
1660          * Compute the size of the stack.  This includes all the pointers,
1661          * the space reserved for the aux vector, and all the strings.
1662          * The total number of pointers is args->na (which is argc + envc)
1663          * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1664          * after the last argument (i.e. argv[argc]); (3) the NULL after the
1665          * last environment variable (i.e. envp[envc]); and (4) the NULL after
1666          * all the strings, at the very top of the stack.
1667          */
1668         size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1669             (args->stk_strp - args->stk_base);
1670 
1671         /*
1672          * Pad the string section with zeroes to align the stack size.
1673          */
1674         pad = P2NPHASE(size, args->stk_align);
1675 
1676         if (STK_AVAIL(args) < pad)
1677                 return (E2BIG);
1678 
1679         args->usrstack_size = size + pad;
1680 
1681         while (pad-- != 0)
1682                 *args->stk_strp++ = 0;
1683 
1684         args->nc = args->stk_strp - args->stk_base;
1685 
1686         return (0);
1687 }
1688 
1689 static int
1690 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1691 {
1692         size_t ptrsize = args->to_ptrsize;
1693         ssize_t pslen;
1694         char *kstrp = args->stk_base;
1695         char *ustrp = usrstack - args->nc - ptrsize;
1696         char *usp = usrstack - args->usrstack_size;
1697         int *offp = (int *)(args->stk_base + args->stk_size);
1698         int envc = args->ne;
1699         int argc = args->na - envc;
1700         int i;
1701 
1702         /*
1703          * Record argc for /proc.
1704          */
1705         up->u_argc = argc;
1706 
1707         /*
1708          * Put argc on the stack.  Note that even though it's an int,
1709          * it always consumes ptrsize bytes (for alignment).
1710          */
1711         if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1712                 return (-1);
1713 
1714         /*
1715          * Add argc space (ptrsize) to usp and record argv for /proc.
1716          */
1717         up->u_argv = (uintptr_t)(usp += ptrsize);
1718 
1719         /*
1720          * Put the argv[] pointers on the stack.
1721          */
1722         for (i = 0; i < argc; i++, usp += ptrsize)
1723                 if (stk_putptr(args, usp, &ustrp[*--offp]))
1724                         return (-1);
1725 
1726         /*
1727          * Copy arguments to u_psargs.
1728          */
1729         pslen = MIN(args->arglen, PSARGSZ) - 1;
1730         for (i = 0; i < pslen; i++)
1731                 up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1732         while (i < PSARGSZ)
1733                 up->u_psargs[i++] = '\0';
1734 
1735         /*
1736          * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1737          * record envp for /proc.
1738          */
1739         up->u_envp = (uintptr_t)(usp += ptrsize);
1740 
1741         /*
1742          * Put the envp[] pointers on the stack.
1743          */
1744         for (i = 0; i < envc; i++, usp += ptrsize)
1745                 if (stk_putptr(args, usp, &ustrp[*--offp]))
1746                         return (-1);
1747 
1748         /*
1749          * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1750          * remember where the stack ends, which is also where auxv begins.
1751          */
1752         args->stackend = usp += ptrsize;
1753 
1754         /*
1755          * Put all the argv[], envp[], and auxv strings on the stack.
1756          */
1757         if (copyout(args->stk_base, ustrp, args->nc))
1758                 return (-1);
1759 
1760         /*
1761          * Fill in the aux vector now that we know the user stack addresses
1762          * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1763          * AT_SUN_EMULATOR strings.
1764          */
1765         if (auxvpp != NULL && *auxvpp != NULL) {
1766                 if (args->to_model == DATAMODEL_NATIVE) {
1767                         auxv_t **a = (auxv_t **)auxvpp;
1768                         ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1769                         ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1770                         if (args->brandname != NULL)
1771                                 ADDAUX(*a,
1772                                     AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1773                         if (args->emulator != NULL)
1774                                 ADDAUX(*a,
1775                                     AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1776                 } else {
1777                         auxv32_t **a = (auxv32_t **)auxvpp;
1778                         ADDAUX(*a,
1779                             AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1780                         ADDAUX(*a,
1781                             AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1782                         if (args->brandname != NULL)
1783                                 ADDAUX(*a, AT_SUN_BRANDNAME,
1784                                     (int)(uintptr_t)&ustrp[*--offp])
1785                         if (args->emulator != NULL)
1786                                 ADDAUX(*a, AT_SUN_EMULATOR,
1787                                     (int)(uintptr_t)&ustrp[*--offp])
1788                 }
1789         }
1790 
1791         return (0);
1792 }
1793 
1794 /*
1795  * Initialize a new user stack with the specified arguments and environment.
1796  * The initial user stack layout is as follows:
1797  *
1798  *      User Stack
1799  *      +---------------+ <--- curproc->p_usrstack
1800  *      |               |
1801  *      | slew          |
1802  *      |               |
1803  *      +---------------+
1804  *      | NULL          |
1805  *      +---------------+
1806  *      |               |
1807  *      | auxv strings  |
1808  *      |               |
1809  *      +---------------+
1810  *      |               |
1811  *      | envp strings  |
1812  *      |               |
1813  *      +---------------+
1814  *      |               |
1815  *      | argv strings  |
1816  *      |               |
1817  *      +---------------+ <--- ustrp
1818  *      |               |
1819  *      | aux vector    |
1820  *      |               |
1821  *      +---------------+ <--- auxv
1822  *      | NULL          |
1823  *      +---------------+
1824  *      | envp[envc-1]  |
1825  *      +---------------+
1826  *      | ...           |
1827  *      +---------------+
1828  *      | envp[0]       |
1829  *      +---------------+ <--- envp[]
1830  *      | NULL          |
1831  *      +---------------+
1832  *      | argv[argc-1]  |
1833  *      +---------------+
1834  *      | ...           |
1835  *      +---------------+
1836  *      | argv[0]       |
1837  *      +---------------+ <--- argv[]
1838  *      | argc          |
1839  *      +---------------+ <--- stack base
1840  */
1841 int
1842 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1843 {
1844         size_t size;
1845         int error;
1846         proc_t *p = ttoproc(curthread);
1847         user_t *up = PTOU(p);
1848         char *usrstack;
1849         rctl_entity_p_t e;
1850         struct as *as;
1851         extern int use_stk_lpg;
1852         size_t sp_slew;
1853 
1854         args->from_model = p->p_model;
1855         if (p->p_model == DATAMODEL_NATIVE) {
1856                 args->from_ptrsize = sizeof (long);
1857         } else {
1858                 args->from_ptrsize = sizeof (int32_t);
1859         }
1860 
1861         if (args->to_model == DATAMODEL_NATIVE) {
1862                 args->to_ptrsize = sizeof (long);
1863                 args->ncargs = NCARGS;
1864                 args->stk_align = STACK_ALIGN;
1865                 if (args->addr32)
1866                         usrstack = (char *)USRSTACK64_32;
1867                 else
1868                         usrstack = (char *)USRSTACK;
1869         } else {
1870                 args->to_ptrsize = sizeof (int32_t);
1871                 args->ncargs = NCARGS32;
1872                 args->stk_align = STACK_ALIGN32;
1873                 usrstack = (char *)USRSTACK32;
1874         }
1875 
1876         ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1877 
1878 #if defined(__sparc)
1879         /*
1880          * Make sure user register windows are empty before
1881          * attempting to make a new stack.
1882          */
1883         (void) flush_user_windows_to_stack(NULL);
1884 #endif
1885 
1886         for (size = PAGESIZE; ; size *= 2) {
1887                 args->stk_size = size;
1888                 args->stk_base = kmem_alloc(size, KM_SLEEP);
1889                 args->stk_strp = args->stk_base;
1890                 args->stk_offp = (int *)(args->stk_base + size);
1891                 error = stk_copyin(uap, args, intp, auxvpp);
1892                 if (error == 0)
1893                         break;
1894                 kmem_free(args->stk_base, size);
1895                 if (error != E2BIG && error != ENAMETOOLONG)
1896                         return (error);
1897                 if (size >= args->ncargs)
1898                         return (E2BIG);
1899         }
1900 
1901         size = args->usrstack_size;
1902 
1903         ASSERT(error == 0);
1904         ASSERT(P2PHASE(size, args->stk_align) == 0);
1905         ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1906 
1907         if (size > args->ncargs) {
1908                 kmem_free(args->stk_base, args->stk_size);
1909                 return (E2BIG);
1910         }
1911 
1912         /*
1913          * Leave only the current lwp and force the other lwps to exit.
1914          * If another lwp beat us to the punch by calling exit(), bail out.
1915          */
1916         if ((error = exitlwps(0)) != 0) {
1917                 kmem_free(args->stk_base, args->stk_size);
1918                 return (error);
1919         }
1920 
1921         /*
1922          * Revoke any doors created by the process.
1923          */
1924         if (p->p_door_list)
1925                 door_exit();
1926 
1927         /*
1928          * Release schedctl data structures.
1929          */
1930         if (p->p_pagep)
1931                 schedctl_proc_cleanup();
1932 
1933         /*
1934          * Clean up any DTrace helpers for the process.
1935          */
1936         if (p->p_dtrace_helpers != NULL) {
1937                 ASSERT(dtrace_helpers_cleanup != NULL);
1938                 (*dtrace_helpers_cleanup)();
1939         }
1940 
1941         mutex_enter(&p->p_lock);
1942         /*
1943          * Cleanup the DTrace provider associated with this process.
1944          */
1945         if (p->p_dtrace_probes) {
1946                 ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1947                 dtrace_fasttrap_exec_ptr(p);
1948         }
1949         mutex_exit(&p->p_lock);
1950 
1951         /*
1952          * discard the lwpchan cache.
1953          */
1954         if (p->p_lcp != NULL)
1955                 lwpchan_destroy_cache(1);
1956 
1957         /*
1958          * Delete the POSIX timers.
1959          */
1960         if (p->p_itimer != NULL)
1961                 timer_exit();
1962 
1963         /*
1964          * Delete the ITIMER_REALPROF interval timer.
1965          * The other ITIMER_* interval timers are specified
1966          * to be inherited across exec().
1967          */
1968         delete_itimer_realprof();
1969 
1970         if (AU_AUDITING())
1971                 audit_exec(args->stk_base, args->stk_base + args->arglen,
1972                     args->na - args->ne, args->ne, args->pfcred);
1973 
1974         /*
1975          * Ensure that we don't change resource associations while we
1976          * change address spaces.
1977          */
1978         mutex_enter(&p->p_lock);
1979         pool_barrier_enter();
1980         mutex_exit(&p->p_lock);
1981 
1982         /*
1983          * Destroy the old address space and create a new one.
1984          * From here on, any errors are fatal to the exec()ing process.
1985          * On error we return -1, which means the caller must SIGKILL
1986          * the process.
1987          */
1988         relvm();
1989 
1990         mutex_enter(&p->p_lock);
1991         pool_barrier_exit();
1992         mutex_exit(&p->p_lock);
1993 
1994         up->u_execsw = args->execswp;
1995 
1996         p->p_brkbase = NULL;
1997         p->p_brksize = 0;
1998         p->p_brkpageszc = 0;
1999         p->p_stksize = 0;
2000         p->p_stkpageszc = 0;
2001         p->p_model = args->to_model;
2002         p->p_usrstack = usrstack;
2003         p->p_stkprot = args->stk_prot;
2004         p->p_datprot = args->dat_prot;
2005 
2006         /*
2007          * Reset resource controls such that all controls are again active as
2008          * well as appropriate to the potentially new address model for the
2009          * process.
2010          */
2011         e.rcep_p.proc = p;
2012         e.rcep_t = RCENTITY_PROCESS;
2013         rctl_set_reset(p->p_rctls, p, &e);
2014 
2015         /* Too early to call map_pgsz for the heap */
2016         if (use_stk_lpg) {
2017                 p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
2018         }
2019 
2020         mutex_enter(&p->p_lock);
2021         p->p_flag |= SAUTOLPG;       /* kernel controls page sizes */
2022         mutex_exit(&p->p_lock);
2023 
2024         /*
2025          * Some platforms may choose to randomize real stack start by adding a
2026          * small slew (not more than a few hundred bytes) to the top of the
2027          * stack. This helps avoid cache thrashing when identical processes
2028          * simultaneously share caches that don't provide enough associativity
2029          * (e.g. sun4v systems). In this case stack slewing makes the same hot
2030          * stack variables in different processes to live in different cache
2031          * sets increasing effective associativity.
2032          */
2033         sp_slew = exec_get_spslew();
2034         ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
2035         exec_set_sp(size + sp_slew);
2036 
2037         as = as_alloc();
2038         p->p_as = as;
2039         as->a_proc = p;
2040         if (p->p_model == DATAMODEL_ILP32 || args->addr32)
2041                 as->a_userlimit = (caddr_t)USERLIMIT32;
2042         (void) hat_setup(as->a_hat, HAT_ALLOC);
2043         hat_join_srd(as->a_hat, args->ex_vp);
2044 
2045         /*
2046          * Finally, write out the contents of the new stack.
2047          */
2048         error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
2049         kmem_free(args->stk_base, args->stk_size);
2050         return (error);
2051 }