1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 /*
  29  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/thread.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred.h>
  38 #include <sys/user.h>
  39 #include <sys/errno.h>
  40 #include <sys/vnode.h>
  41 #include <sys/mman.h>
  42 #include <sys/kmem.h>
  43 #include <sys/proc.h>
  44 #include <sys/pathname.h>
  45 #include <sys/policy.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/systm.h>
  48 #include <sys/elf.h>
  49 #include <sys/vmsystm.h>
  50 #include <sys/debug.h>
  51 #include <sys/auxv.h>
  52 #include <sys/exec.h>
  53 #include <sys/prsystm.h>
  54 #include <vm/as.h>
  55 #include <vm/rm.h>
  56 #include <vm/seg.h>
  57 #include <vm/seg_vn.h>
  58 #include <sys/modctl.h>
  59 #include <sys/systeminfo.h>
  60 #include <sys/vmparam.h>
  61 #include <sys/machelf.h>
  62 #include <sys/shm_impl.h>
  63 #include <sys/archsystm.h>
  64 #include <sys/fasttrap.h>
  65 #include <sys/brand.h>
  66 #include "elf_impl.h"
  67 #include <sys/sdt.h>
  68 #include <sys/siginfo.h>
  69 #include <sys/random.h>
  70 
  71 extern int at_flags;
  72 extern volatile size_t aslr_max_brk_skew;
  73 
  74 #define ORIGIN_STR      "ORIGIN"
  75 #define ORIGIN_STR_SIZE 6
  76 
  77 static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
  78 static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
  79     ssize_t *);
  80 static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
  81     ssize_t *, caddr_t *, ssize_t *);
  82 static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
  83 static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
  84     Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
  85     caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
  86 
  87 typedef enum {
  88         STR_CTF,
  89         STR_SYMTAB,
  90         STR_DYNSYM,
  91         STR_STRTAB,
  92         STR_DYNSTR,
  93         STR_SHSTRTAB,
  94         STR_NUM
  95 } shstrtype_t;
  96 
  97 static const char *shstrtab_data[] = {
  98         ".SUNW_ctf",
  99         ".symtab",
 100         ".dynsym",
 101         ".strtab",
 102         ".dynstr",
 103         ".shstrtab"
 104 };
 105 
 106 typedef struct shstrtab {
 107         int     sst_ndx[STR_NUM];
 108         int     sst_cur;
 109 } shstrtab_t;
 110 
 111 static void
 112 shstrtab_init(shstrtab_t *s)
 113 {
 114         bzero(&s->sst_ndx, sizeof (s->sst_ndx));
 115         s->sst_cur = 1;
 116 }
 117 
 118 static int
 119 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
 120 {
 121         int ret;
 122 
 123         if ((ret = s->sst_ndx[type]) != 0)
 124                 return (ret);
 125 
 126         ret = s->sst_ndx[type] = s->sst_cur;
 127         s->sst_cur += strlen(shstrtab_data[type]) + 1;
 128 
 129         return (ret);
 130 }
 131 
 132 static size_t
 133 shstrtab_size(const shstrtab_t *s)
 134 {
 135         return (s->sst_cur);
 136 }
 137 
 138 static void
 139 shstrtab_dump(const shstrtab_t *s, char *buf)
 140 {
 141         int i, ndx;
 142 
 143         *buf = '\0';
 144         for (i = 0; i < STR_NUM; i++) {
 145                 if ((ndx = s->sst_ndx[i]) != 0)
 146                         (void) strcpy(buf + ndx, shstrtab_data[i]);
 147         }
 148 }
 149 
 150 static int
 151 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 152 {
 153         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 154 
 155         /*
 156          * See the comment in fasttrap.h for information on how to safely
 157          * update this program header.
 158          */
 159         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 160             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 161                 return (-1);
 162 
 163         args->thrptr = phdrp->p_vaddr + base;
 164 
 165         return (0);
 166 }
 167 
 168 static int
 169 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
 170 {
 171         uint_t flag;
 172 
 173         switch (dt) {
 174         case DT_SUNW_ASLR:
 175                 flag = PROC_SEC_ASLR;
 176                 break;
 177         default:
 178                 return (EINVAL);
 179         }
 180 
 181         if (val == 0) {
 182                 if (secflag_isset(p->p_secflags.psf_lower, flag))
 183                         return (EPERM);
 184                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 185                     secflag_isset(p->p_secflags.psf_inherit, flag))
 186                         return (EPERM);
 187 
 188                 secflag_clear(&p->p_secflags.psf_effective, flag);
 189         } else {
 190                 if (!secflag_isset(p->p_secflags.psf_upper, flag))
 191                         return (EPERM);
 192 
 193                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 194                     !secflag_isset(p->p_secflags.psf_inherit, flag))
 195                         return (EPERM);
 196 
 197                 secflag_set(&p->p_secflags.psf_effective, flag);
 198         }
 199 
 200         return (0);
 201 }
 202 
 203 /*
 204  * Map in the executable pointed to by vp. Returns 0 on success.
 205  */
 206 int
 207 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 208     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
 209     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
 210 {
 211         size_t          len;
 212         struct vattr    vat;
 213         caddr_t         phdrbase = NULL;
 214         ssize_t         phdrsize;
 215         int             nshdrs, shstrndx, nphdrs;
 216         int             error = 0;
 217         Phdr            *uphdr = NULL;
 218         Phdr            *junk = NULL;
 219         Phdr            *dynphdr = NULL;
 220         Phdr            *dtrphdr = NULL;
 221         uintptr_t       lddata;
 222         long            execsz;
 223         intptr_t        minaddr;
 224 
 225         if (lddatap != NULL)
 226                 *lddatap = NULL;
 227 
 228         if (error = execpermissions(vp, &vat, args)) {
 229                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 230                 return (error);
 231         }
 232 
 233         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 234             &nphdrs)) != 0 ||
 235             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 236             &phdrsize)) != 0) {
 237                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 238                 return (error);
 239         }
 240 
 241         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 242                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 243                 kmem_free(phdrbase, phdrsize);
 244                 return (ENOEXEC);
 245         }
 246         if (lddatap != NULL)
 247                 *lddatap = lddata;
 248 
 249         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 250             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 251             len, &execsz, brksize)) {
 252                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 253                 kmem_free(phdrbase, phdrsize);
 254                 return (error);
 255         }
 256 
 257         /*
 258          * Inform our caller if the executable needs an interpreter.
 259          */
 260         *interp = (dynphdr == NULL) ? 0 : 1;
 261 
 262         /*
 263          * If this is a statically linked executable, voffset should indicate
 264          * the address of the executable itself (it normally holds the address
 265          * of the interpreter).
 266          */
 267         if (ehdr->e_type == ET_EXEC && *interp == 0)
 268                 *voffset = minaddr;
 269 
 270         if (uphdr != NULL) {
 271                 *uphdr_vaddr = uphdr->p_vaddr;
 272         } else {
 273                 *uphdr_vaddr = (Addr)-1;
 274         }
 275 
 276         kmem_free(phdrbase, phdrsize);
 277         return (error);
 278 }
 279 
 280 /*ARGSUSED*/
 281 int
 282 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 283     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
 284     int brand_action)
 285 {
 286         caddr_t         phdrbase = NULL;
 287         caddr_t         bssbase = 0;
 288         caddr_t         brkbase = 0;
 289         size_t          brksize = 0;
 290         ssize_t         dlnsize;
 291         aux_entry_t     *aux;
 292         int             error;
 293         ssize_t         resid;
 294         int             fd = -1;
 295         intptr_t        voffset;
 296         Phdr            *intphdr = NULL;
 297         Phdr            *dynamicphdr = NULL;
 298         Phdr            *stphdr = NULL;
 299         Phdr            *uphdr = NULL;
 300         Phdr            *junk = NULL;
 301         size_t          len;
 302         ssize_t         phdrsize;
 303         int             postfixsize = 0;
 304         int             i, hsize;
 305         Phdr            *phdrp;
 306         Phdr            *dataphdrp = NULL;
 307         Phdr            *dtrphdr;
 308         Phdr            *capphdr = NULL;
 309         Cap             *cap = NULL;
 310         ssize_t         capsize;
 311         Dyn             *dyn = NULL;
 312         int             hasu = 0;
 313         int             hasauxv = 0;
 314         int             hasintp = 0;
 315         int             branded = 0;
 316 
 317         struct proc *p = ttoproc(curthread);
 318         struct user *up = PTOU(p);
 319         struct bigwad {
 320                 Ehdr    ehdr;
 321                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 322                 char            dl_name[MAXPATHLEN];
 323                 char            pathbuf[MAXPATHLEN];
 324                 struct vattr    vattr;
 325                 struct execenv  exenv;
 326         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 327         Ehdr            *ehdrp;
 328         int             nshdrs, shstrndx, nphdrs;
 329         char            *dlnp;
 330         char            *pathbufp;
 331         rlim64_t        limit;
 332         rlim64_t        roundlimit;
 333 
 334         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 335 
 336         bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
 337         ehdrp = &bigwad->ehdr;
 338         dlnp = bigwad->dl_name;
 339         pathbufp = bigwad->pathbuf;
 340 
 341         /*
 342          * Obtain ELF and program header information.
 343          */
 344         if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
 345             &nphdrs)) != 0 ||
 346             (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
 347             &phdrsize)) != 0)
 348                 goto out;
 349 
 350         /*
 351          * Prevent executing an ELF file that has no entry point.
 352          */
 353         if (ehdrp->e_entry == 0) {
 354                 uprintf("%s: Bad entry point\n", exec_file);
 355                 goto bad;
 356         }
 357 
 358         /*
 359          * Put data model that we're exec-ing to into the args passed to
 360          * exec_args(), so it will know what it is copying to on new stack.
 361          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 362          * executable, we can set execsz with the appropriate NCARGS.
 363          */
 364 #ifdef  _LP64
 365         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 366                 args->to_model = DATAMODEL_ILP32;
 367                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 368         } else {
 369                 args->to_model = DATAMODEL_LP64;
 370                 args->stk_prot &= ~PROT_EXEC;
 371 #if defined(__i386) || defined(__amd64)
 372                 args->dat_prot &= ~PROT_EXEC;
 373 #endif
 374                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 375         }
 376 #else   /* _LP64 */
 377         args->to_model = DATAMODEL_ILP32;
 378         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 379 #endif  /* _LP64 */
 380 
 381         /*
 382          * We delay invoking the brand callback until we've figured out
 383          * what kind of elf binary we're trying to run, 32-bit or 64-bit.
 384          * We do this because now the brand library can just check
 385          * args->to_model to see if the target is 32-bit or 64-bit without
  386          * having to duplicate all the code above.
 387          *
 388          * The level checks associated with brand handling below are used to
 389          * prevent a loop since the brand elfexec function typically comes back
 390          * through this function. We must check <= here since the nested
 391          * handling in the #! interpreter code will increment the level before
 392          * calling gexec to run the final elfexec interpreter.
 393          */
 394         if ((level <= INTP_MAXDEPTH) &&
 395             (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 396                 error = BROP(p)->b_elfexec(vp, uap, args,
 397                     idatap, level + 1, execsz, setid, exec_file, cred,
 398                     brand_action);
 399                 goto out;
 400         }
 401 
 402         /*
 403          * Determine aux size now so that stack can be built
 404          * in one shot (except actual copyout of aux image),
 405          * determine any non-default stack protections,
 406          * and still have this code be machine independent.
 407          */
 408         hsize = ehdrp->e_phentsize;
 409         phdrp = (Phdr *)phdrbase;
 410         for (i = nphdrs; i > 0; i--) {
 411                 switch (phdrp->p_type) {
 412                 case PT_INTERP:
 413                         hasauxv = hasintp = 1;
 414                         break;
 415                 case PT_PHDR:
 416                         hasu = 1;
 417                         break;
 418                 case PT_SUNWSTACK:
 419                         args->stk_prot = PROT_USER;
 420                         if (phdrp->p_flags & PF_R)
 421                                 args->stk_prot |= PROT_READ;
 422                         if (phdrp->p_flags & PF_W)
 423                                 args->stk_prot |= PROT_WRITE;
 424                         if (phdrp->p_flags & PF_X)
 425                                 args->stk_prot |= PROT_EXEC;
 426                         break;
 427                 case PT_LOAD:
 428                         dataphdrp = phdrp;
 429                         break;
 430                 case PT_SUNWCAP:
 431                         capphdr = phdrp;
 432                         break;
 433                 case PT_DYNAMIC:
 434                         dynamicphdr = phdrp;
 435                         break;
 436                 }
 437                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
 438         }
 439 
 440         if (ehdrp->e_type != ET_EXEC) {
 441                 dataphdrp = NULL;
 442                 hasauxv = 1;
 443         }
 444 
 445         /* Copy BSS permissions to args->dat_prot */
 446         if (dataphdrp != NULL) {
 447                 args->dat_prot = PROT_USER;
 448                 if (dataphdrp->p_flags & PF_R)
 449                         args->dat_prot |= PROT_READ;
 450                 if (dataphdrp->p_flags & PF_W)
 451                         args->dat_prot |= PROT_WRITE;
 452                 if (dataphdrp->p_flags & PF_X)
 453                         args->dat_prot |= PROT_EXEC;
 454         }
 455 
 456         /*
 457          * If a auxvector will be required - reserve the space for
 458          * it now.  This may be increased by exec_args if there are
 459          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 460          */
 461         if (hasauxv) {
 462                 /*
 463                  * If a AUX vector is being built - the base AUX
 464                  * entries are:
 465                  *
 466                  *      AT_BASE
 467                  *      AT_FLAGS
 468                  *      AT_PAGESZ
 469                  *      AT_SUN_AUXFLAGS
 470                  *      AT_SUN_HWCAP
 471                  *      AT_SUN_HWCAP2
 472                  *      AT_SUN_PLATFORM (added in stk_copyout)
 473                  *      AT_SUN_EXECNAME (added in stk_copyout)
 474                  *      AT_NULL
 475                  *
 476                  * total == 9
 477                  */
 478                 if (hasintp && hasu) {
 479                         /*
 480                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 481                          * will be built are:
 482                          *
 483                          *      AT_PHDR
 484                          *      AT_PHENT
 485                          *      AT_PHNUM
 486                          *      AT_ENTRY
 487                          *      AT_LDDATA
 488                          *
 489                          * total = 5
 490                          */
 491                         args->auxsize = (9 + 5) * sizeof (aux_entry_t);
 492                 } else if (hasintp) {
 493                         /*
 494                          * Has PT_INTERP but no PT_PHDR
 495                          *
 496                          *      AT_EXECFD
 497                          *      AT_LDDATA
 498                          *
 499                          * total = 2
 500                          */
 501                         args->auxsize = (9 + 2) * sizeof (aux_entry_t);
 502                 } else {
 503                         args->auxsize = 9 * sizeof (aux_entry_t);
 504                 }
 505         } else {
 506                 args->auxsize = 0;
 507         }
 508 
 509         /*
 510          * If this binary is using an emulator, we need to add an
 511          * AT_SUN_EMULATOR aux entry.
 512          */
 513         if (args->emulator != NULL)
 514                 args->auxsize += sizeof (aux_entry_t);
 515 
 516         if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 517                 branded = 1;
 518                 /*
 519                  * We will be adding 4 entries to the aux vectors.  One for
  520                  * the brandname and 3 for the brand specific aux vectors.
 521                  */
 522                 args->auxsize += 4 * sizeof (aux_entry_t);
 523         }
 524 
 525         /* If the binary has an explicit ASLR flag, it must be honoured */
 526         if ((dynamicphdr != NULL) &&
 527             (dynamicphdr->p_filesz > 0)) {
 528                 Dyn *dp;
 529                 off_t i = 0;
 530 
 531 #define DYN_STRIDE      100
 532                 for (i = 0; i < dynamicphdr->p_filesz;
 533                     i += sizeof (*dyn) * DYN_STRIDE) {
 534                         int ndyns = (dynamicphdr->p_filesz - i) / sizeof (*dyn);
 535                         size_t dynsize;
 536 
 537                         ndyns = MIN(DYN_STRIDE, ndyns);
 538                         dynsize = ndyns * sizeof (*dyn);
 539 
 540                         dyn = kmem_alloc(dynsize, KM_SLEEP);
 541 
 542                         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
 543                             dynsize, (offset_t)(dynamicphdr->p_offset + i),
 544                             UIO_SYSSPACE, 0, (rlim64_t)0,
 545                             CRED(), &resid)) != 0) {
 546                                 uprintf("%s: cannot read .dynamic section\n",
 547                                     exec_file);
 548                                 goto out;
 549                         }
 550 
 551                         for (dp = dyn; dp < (dyn + ndyns); dp++) {
 552                                 if (dp->d_tag == DT_SUNW_ASLR) {
 553                                         if ((error = handle_secflag_dt(p,
 554                                             DT_SUNW_ASLR,
 555                                             dp->d_un.d_val)) != 0) {
 556                                                 uprintf("%s: error setting "
 557                                                     "security-flag from "
 558                                                     "DT_SUNW_ASLR: %d\n",
 559                                                     exec_file, error);
 560                                                 goto out;
 561                                         }
 562                                 }
 563                         }
 564 
 565                         kmem_free(dyn, dynsize);
 566                 }
 567         }
 568 
 569         /* Hardware/Software capabilities */
 570         if (capphdr != NULL &&
 571             (capsize = capphdr->p_filesz) > 0 &&
 572             capsize <= 16 * sizeof (*cap)) {
 573                 int ncaps = capsize / sizeof (*cap);
 574                 Cap *cp;
 575 
 576                 cap = kmem_alloc(capsize, KM_SLEEP);
 577                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 578                     capsize, (offset_t)capphdr->p_offset,
 579                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 580                         uprintf("%s: Cannot read capabilities section\n",
 581                             exec_file);
 582                         goto out;
 583                 }
 584                 for (cp = cap; cp < cap + ncaps; cp++) {
 585                         if (cp->c_tag == CA_SUNW_SF_1 &&
 586                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 587                                 if (args->to_model == DATAMODEL_LP64)
 588                                         args->addr32 = 1;
 589                                 break;
 590                         }
 591                 }
 592         }
 593 
 594         aux = bigwad->elfargs;
 595         /*
 596          * Move args to the user's stack.
 597          * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
 598          */
 599         if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 600                 if (error == -1) {
 601                         error = ENOEXEC;
 602                         goto bad;
 603                 }
 604                 goto out;
 605         }
 606         /* we're single threaded after this point */
 607 
 608         /*
 609          * If this is an ET_DYN executable (shared object),
 610          * determine its memory size so that mapelfexec() can load it.
 611          */
 612         if (ehdrp->e_type == ET_DYN)
 613                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 614         else
 615                 len = 0;
 616 
 617         dtrphdr = NULL;
 618 
 619         if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
 620             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 621             len, execsz, &brksize)) != 0)
 622                 goto bad;
 623 
 624         if (uphdr != NULL && intphdr == NULL)
 625                 goto bad;
 626 
 627         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 628                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 629                 goto bad;
 630         }
 631 
 632         if (intphdr != NULL) {
 633                 size_t          len;
 634                 uintptr_t       lddata;
 635                 char            *p;
 636                 struct vnode    *nvp;
 637 
 638                 dlnsize = intphdr->p_filesz;
 639 
 640                 if (dlnsize > MAXPATHLEN || dlnsize <= 0)
 641                         goto bad;
 642 
 643                 /*
 644                  * Read in "interpreter" pathname.
 645                  */
 646                 if ((error = vn_rdwr(UIO_READ, vp, dlnp, intphdr->p_filesz,
 647                     (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
 648                     CRED(), &resid)) != 0) {
 649                         uprintf("%s: Cannot obtain interpreter pathname\n",
 650                             exec_file);
 651                         goto bad;
 652                 }
 653 
 654                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 655                         goto bad;
 656 
 657                 /*
 658                  * Search for '$ORIGIN' token in interpreter path.
 659                  * If found, expand it.
 660                  */
 661                 for (p = dlnp; p = strchr(p, '$'); ) {
 662                         uint_t  len, curlen;
 663                         char    *_ptr;
 664 
 665                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 666                                 continue;
 667 
 668                         /*
 669                          * We don't support $ORIGIN on setid programs to close
 670                          * a potential attack vector.
 671                          */
 672                         if ((setid & EXECSETID_SETID) != 0) {
 673                                 error = ENOEXEC;
 674                                 goto bad;
 675                         }
 676 
 677                         curlen = 0;
 678                         len = p - dlnp - 1;
 679                         if (len) {
 680                                 bcopy(dlnp, pathbufp, len);
 681                                 curlen += len;
 682                         }
 683                         if (_ptr = strrchr(args->pathname, '/')) {
 684                                 len = _ptr - args->pathname;
 685                                 if ((curlen + len) > MAXPATHLEN)
 686                                         break;
 687 
 688                                 bcopy(args->pathname, &pathbufp[curlen], len);
 689                                 curlen += len;
 690                         } else {
 691                                 /*
 692                                  * executable is a basename found in the
  693                                  * current directory.  So - just substitute
 694                                  * '.' for ORIGIN.
 695                                  */
 696                                 pathbufp[curlen] = '.';
 697                                 curlen++;
 698                         }
 699                         p += ORIGIN_STR_SIZE;
 700                         len = strlen(p);
 701 
 702                         if ((curlen + len) > MAXPATHLEN)
 703                                 break;
 704                         bcopy(p, &pathbufp[curlen], len);
 705                         curlen += len;
 706                         pathbufp[curlen++] = '\0';
 707                         bcopy(pathbufp, dlnp, curlen);
 708                 }
 709 
 710                 /*
 711                  * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
 712                  * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
 713                  * Just in case /usr is not mounted, change it now.
 714                  */
 715                 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
 716                         dlnp += 4;
 717                 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
 718                 if (error && dlnp != bigwad->dl_name) {
 719                         /* new kernel, old user-level */
 720                         error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
 721                             NULLVPP, &nvp);
 722                 }
 723                 if (error) {
 724                         uprintf("%s: Cannot find %s\n", exec_file, dlnp);
 725                         goto bad;
 726                 }
 727 
 728                 /*
 729                  * Setup the "aux" vector.
 730                  */
 731                 if (uphdr) {
 732                         if (ehdrp->e_type == ET_DYN) {
 733                                 /* don't use the first page */
 734                                 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
 735                                 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
 736                         } else {
 737                                 bigwad->exenv.ex_bssbase = bssbase;
 738                                 bigwad->exenv.ex_brkbase = brkbase;
 739                         }
 740                         bigwad->exenv.ex_brksize = brksize;
 741                         bigwad->exenv.ex_magic = elfmagic;
 742                         bigwad->exenv.ex_vp = vp;
 743                         setexecenv(&bigwad->exenv);
 744 
 745                         ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
 746                         ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
 747                         ADDAUX(aux, AT_PHNUM, nphdrs)
 748                         ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
 749                 } else {
 750                         if ((error = execopen(&vp, &fd)) != 0) {
 751                                 VN_RELE(nvp);
 752                                 goto bad;
 753                         }
 754 
 755                         ADDAUX(aux, AT_EXECFD, fd)
 756                 }
 757 
 758                 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
 759                         VN_RELE(nvp);
 760                         uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
 761                         goto bad;
 762                 }
 763 
 764                 /*
 765                  * Now obtain the ELF header along with the entire program
 766                  * header contained in "nvp".
 767                  */
 768                 kmem_free(phdrbase, phdrsize);
 769                 phdrbase = NULL;
 770                 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
 771                     &shstrndx, &nphdrs)) != 0 ||
 772                     (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
 773                     &phdrsize)) != 0) {
 774                         VN_RELE(nvp);
 775                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 776                         goto bad;
 777                 }
 778 
 779                 /*
 780                  * Determine memory size of the "interpreter's" loadable
 781                  * sections.  This size is then used to obtain the virtual
 782                  * address of a hole, in the user's address space, large
 783                  * enough to map the "interpreter".
 784                  */
 785                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 786                         VN_RELE(nvp);
 787                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 788                         goto bad;
 789                 }
 790 
 791                 dtrphdr = NULL;
 792 
 793                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
 794                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 795                     execsz, NULL);
 796                 if (error || junk != NULL) {
 797                         VN_RELE(nvp);
 798                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
 799                         goto bad;
 800                 }
 801 
 802                 /*
 803                  * We use the DTrace program header to initialize the
 804                  * architecture-specific user per-LWP location. The dtrace
 805                  * fasttrap provider requires ready access to per-LWP scratch
 806                  * space. We assume that there is only one such program header
 807                  * in the interpreter.
 808                  */
 809                 if (dtrphdr != NULL &&
 810                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 811                         VN_RELE(nvp);
 812                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
 813                         goto bad;
 814                 }
 815 
 816                 VN_RELE(nvp);
 817                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
 818         }
 819 
 820         if (hasauxv) {
 821                 int auxf = AF_SUN_HWCAPVERIFY;
 822                 /*
 823                  * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
 824                  * exec_args()
 825                  */
 826                 ADDAUX(aux, AT_BASE, voffset)
 827                 ADDAUX(aux, AT_FLAGS, at_flags)
 828                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
 829                 /*
 830                  * Linker flags. (security)
 831                  * p_flag not yet set at this time.
 832                  * We rely on gexec() to provide us with the information.
 833                  * If the application is set-uid but this is not reflected
 834                  * in a mismatch between real/effective uids/gids, then
 835                  * don't treat this as a set-uid exec.  So we care about
 836                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
 837                  */
 838                 if ((setid &= ~EXECSETID_SETID) != 0)
 839                         auxf |= AF_SUN_SETUGID;
 840 
 841                 /*
 842                  * If we're running a native process from within a branded
 843                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
 844                  * that the native ld.so.1 is able to link with the native
 845                  * libraries instead of using the brand libraries that are
 846                  * installed in the zone.  We only do this for processes
 847                  * which we trust because we see they are already running
 848                  * under pfexec (where uid != euid).  This prevents a
 849                  * malicious user within the zone from crafting a wrapper to
 850                  * run native suid commands with unsecure libraries interposed.
 851                  */
 852                 if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 853                     (setid &= ~EXECSETID_SETID) != 0))
 854                         auxf &= ~AF_SUN_SETUGID;
 855 
 856                 /*
 857                  * Record the user addr of the auxflags aux vector entry
 858                  * since brands may optionally want to manipulate this field.
 859                  */
 860                 args->auxp_auxflags =
 861                     (char *)((char *)args->stackend +
 862                     ((char *)&aux->a_type -
 863                     (char *)bigwad->elfargs));
 864                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
 865 
 866                 /*
 867                  * Hardware capability flag word (performance hints)
 868                  * Used for choosing faster library routines.
 869                  * (Potentially different between 32-bit and 64-bit ABIs)
 870                  */
 871 #if defined(_LP64)
 872                 if (args->to_model == DATAMODEL_NATIVE) {
 873                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 874                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 875                 } else {
 876                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
 877                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
 878                 }
 879 #else
 880                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 881                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 882 #endif
 883                 if (branded) {
 884                         /*
 885                          * Reserve space for the brand-private aux vectors,
 886                          * and record the user addr of that space.
 887                          */
 888                         args->auxp_brand =
 889                             (char *)((char *)args->stackend +
 890                             ((char *)&aux->a_type -
 891                             (char *)bigwad->elfargs));
 892                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
 893                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
 894                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
 895                 }
 896 
 897                 ADDAUX(aux, AT_NULL, 0)
 898                 postfixsize = (char *)aux - (char *)bigwad->elfargs;
 899 
 900                 /*
 901                  * We make assumptions above when we determine how many aux
 902                  * vector entries we will be adding. However, if we have an
 903                  * invalid elf file, it is possible that mapelfexec might
 904                  * behave differently (but not return an error), in which case
 905                  * the number of aux entries we actually add will be different.
 906                  * We detect that now and error out.
 907                  */
 908                 if (postfixsize != args->auxsize) {
 909                         DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
 910                             int, args->auxsize);
 911                         goto bad;
 912                 }
 913                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
 914         }
 915 
 916         /*
 917          * For the 64-bit kernel, the limit is big enough that rounding it up
 918          * to a page can overflow the 64-bit limit, so we check for btopr()
 919          * overflowing here by comparing it with the unrounded limit in pages.
 920          * If it hasn't overflowed, compare the exec size with the rounded up
 921          * limit in pages.  Otherwise, just compare with the unrounded limit.
 922          */
 923         limit = btop(p->p_vmem_ctl);
 924         roundlimit = btopr(p->p_vmem_ctl);
 925         if ((roundlimit > limit && *execsz > roundlimit) ||
 926             (roundlimit < limit && *execsz > limit)) {
 927                 mutex_enter(&p->p_lock);
 928                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
 929                     RCA_SAFE);
 930                 mutex_exit(&p->p_lock);
 931                 error = ENOMEM;
 932                 goto bad;
 933         }
 934 
 935         bzero(up->u_auxv, sizeof (up->u_auxv));
 936         if (postfixsize) {
 937                 int num_auxv;
 938 
 939                 /*
 940                  * Copy the aux vector to the user stack.
 941                  */
 942                 error = execpoststack(args, bigwad->elfargs, postfixsize);
 943                 if (error)
 944                         goto bad;
 945 
 946                 /*
 947                  * Copy auxv to the process's user structure for use by /proc.
 948                  * If this is a branded process, the brand's exec routine will
 949                  * copy it's private entries to the user structure later. It
 950                  * relies on the fact that the blank entries are at the end.
 951                  */
 952                 num_auxv = postfixsize / sizeof (aux_entry_t);
 953                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
 954                 aux = bigwad->elfargs;
 955                 for (i = 0; i < num_auxv; i++) {
 956                         up->u_auxv[i].a_type = aux[i].a_type;
 957                         up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
 958                 }
 959         }
 960 
 961         /*
 962          * Pass back the starting address so we can set the program counter.
 963          */
 964         args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
 965 
 966         if (!uphdr) {
 967                 if (ehdrp->e_type == ET_DYN) {
 968                         /*
 969                          * If we are executing a shared library which doesn't
 970                          * have a interpreter (probably ld.so.1) then
 971                          * we don't set the brkbase now.  Instead we
 972                          * delay it's setting until the first call
 973                          * via grow.c::brk().  This permits ld.so.1 to
 974                          * initialize brkbase to the tail of the executable it
 975                          * loads (which is where it needs to be).
 976                          */
 977                         bigwad->exenv.ex_brkbase = (caddr_t)0;
 978                         bigwad->exenv.ex_bssbase = (caddr_t)0;
 979                         bigwad->exenv.ex_brksize = 0;
 980                 } else {
 981                         bigwad->exenv.ex_brkbase = brkbase;
 982                         bigwad->exenv.ex_bssbase = bssbase;
 983                         bigwad->exenv.ex_brksize = brksize;
 984                 }
 985                 bigwad->exenv.ex_magic = elfmagic;
 986                 bigwad->exenv.ex_vp = vp;
 987                 setexecenv(&bigwad->exenv);
 988         }
 989 
 990         ASSERT(error == 0);
 991         goto out;
 992 
 993 bad:
 994         if (fd != -1)           /* did we open the a.out yet */
 995                 (void) execclose(fd);
 996 
 997         psignal(p, SIGKILL);
 998 
 999         if (error == 0)
1000                 error = ENOEXEC;
1001 out:
1002         if (phdrbase != NULL)
1003                 kmem_free(phdrbase, phdrsize);
1004         if (cap != NULL)
1005                 kmem_free(cap, capsize);
1006         kmem_free(bigwad, sizeof (struct bigwad));
1007         return (error);
1008 }
1009 
1010 /*
1011  * Compute the memory size requirement for the ELF file.
1012  */
1013 static size_t
1014 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
1015 {
1016         size_t  len;
1017         Phdr    *phdrp = (Phdr *)phdrbase;
1018         int     hsize = ehdrp->e_phentsize;
1019         int     first = 1;
1020         int     dfirst = 1;     /* first data segment */
1021         uintptr_t loaddr = 0;
1022         uintptr_t hiaddr = 0;
1023         uintptr_t lo, hi;
1024         int     i;
1025 
1026         for (i = nphdrs; i > 0; i--) {
1027                 if (phdrp->p_type == PT_LOAD) {
1028                         lo = phdrp->p_vaddr;
1029                         hi = lo + phdrp->p_memsz;
1030                         if (first) {
1031                                 loaddr = lo;
1032                                 hiaddr = hi;
1033                                 first = 0;
1034                         } else {
1035                                 if (loaddr > lo)
1036                                         loaddr = lo;
1037                                 if (hiaddr < hi)
1038                                         hiaddr = hi;
1039                         }
1040 
1041                         /*
1042                          * save the address of the first data segment
1043                          * of a object - used for the AT_SUNW_LDDATA
1044                          * aux entry.
1045                          */
1046                         if ((lddata != NULL) && dfirst &&
1047                             (phdrp->p_flags & PF_W)) {
1048                                 *lddata = lo;
1049                                 dfirst = 0;
1050                         }
1051                 }
1052                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1053         }
1054 
1055         len = hiaddr - (loaddr & PAGEMASK);
1056         len = roundup(len, PAGESIZE);
1057 
1058         return (len);
1059 }
1060 
1061 /*
1062  * Read in the ELF header and program header table.
1063  * SUSV3 requires:
1064  *      ENOEXEC File format is not recognized
1065  *      EINVAL  Format recognized but execution not supported
1066  */
1067 static int
1068 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
1069     int *nphdrs)
1070 {
1071         int error;
1072         ssize_t resid;
1073 
1074         /*
1075          * We got here by the first two bytes in ident,
1076          * now read the entire ELF header.
1077          */
1078         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
1079             sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
1080             (rlim64_t)0, credp, &resid)) != 0)
1081                 return (error);
1082 
1083         /*
1084          * Since a separate version is compiled for handling 32-bit and
1085          * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1086          * doesn't need to be able to deal with 32-bit ELF files.
1087          */
1088         if (resid != 0 ||
1089             ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1090             ehdr->e_ident[EI_MAG3] != ELFMAG3)
1091                 return (ENOEXEC);
1092 
1093         if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1094 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1095             ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1096 #else
1097             ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1098 #endif
1099             !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1100             ehdr->e_flags))
1101                 return (EINVAL);
1102 
1103         *nshdrs = ehdr->e_shnum;
1104         *shstrndx = ehdr->e_shstrndx;
1105         *nphdrs = ehdr->e_phnum;
1106 
1107         /*
1108          * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1109          * to read in the section header at index zero to acces the true
1110          * values for those fields.
1111          */
1112         if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1113             *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1114                 Shdr shdr;
1115 
1116                 if (ehdr->e_shoff == 0)
1117                         return (EINVAL);
1118 
1119                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1120                     sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1121                     (rlim64_t)0, credp, &resid)) != 0)
1122                         return (error);
1123 
1124                 if (*nshdrs == 0)
1125                         *nshdrs = shdr.sh_size;
1126                 if (*shstrndx == SHN_XINDEX)
1127                         *shstrndx = shdr.sh_link;
1128                 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1129                         *nphdrs = shdr.sh_info;
1130         }
1131 
1132         return (0);
1133 }
1134 
/*
 * Tunable used by getelfphdr(): a program header table larger than
 * elf_nphdr_max Phdr structures is allocated with KM_NOSLEEP rather than
 * KM_SLEEP.  The variable is defined once, by the build that does not
 * define _ELF32_COMPAT, and merely declared by the _ELF32_COMPAT build.
 */
#ifndef _ELF32_COMPAT
size_t elf_nphdr_max = 1000;
#else
extern size_t elf_nphdr_max;
#endif
1140 
1141 static int
1142 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
1143     caddr_t *phbasep, ssize_t *phsizep)
1144 {
1145         ssize_t resid, minsize;
1146         int err;
1147 
1148         /*
1149          * Since we're going to be using e_phentsize to iterate down the
1150          * array of program headers, it must be 8-byte aligned or else
1151          * a we might cause a misaligned access. We use all members through
1152          * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so
1153          * e_phentsize must be at least large enough to include those
1154          * members.
1155          */
1156 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1157         minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags);
1158 #else
1159         minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz);
1160 #endif
1161         if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3))
1162                 return (EINVAL);
1163 
1164         *phsizep = nphdrs * ehdr->e_phentsize;
1165 
1166         if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1167                 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1168                         return (ENOMEM);
1169         } else {
1170                 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1171         }
1172 
1173         if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep,
1174             (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1175             credp, &resid)) != 0) {
1176                 kmem_free(*phbasep, *phsizep);
1177                 *phbasep = NULL;
1178                 return (err);
1179         }
1180 
1181         return (0);
1182 }
1183 
/*
 * Tunables used by getelfshdr(): a section header table larger than
 * elf_nshdr_max Shdr structures, or a section name string table larger
 * than elf_shstrtab_max bytes, is allocated with KM_NOSLEEP rather than
 * KM_SLEEP.  The variables are defined once, by the build that does not
 * define _ELF32_COMPAT, and merely declared by the _ELF32_COMPAT build.
 */
#ifndef _ELF32_COMPAT
size_t elf_nshdr_max = 10000;
size_t elf_shstrtab_max = 100 * 1024;
#else
extern size_t elf_nshdr_max;
extern size_t elf_shstrtab_max;
#endif
1191 
1192 
1193 static int
1194 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
1195     int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep,
1196     char **shstrbasep, ssize_t *shstrsizep)
1197 {
1198         ssize_t resid, minsize;
1199         int err;
1200         Shdr *shdr;
1201 
1202         /*
1203          * Since we're going to be using e_shentsize to iterate down the
1204          * array of section headers, it must be 8-byte aligned or else
1205          * a we might cause a misaligned access. We use all members through
1206          * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1207          * must be at least large enough to include that member. The index
1208          * of the string table section must also be valid.
1209          */
1210         minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize);
1211         if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) ||
1212             shstrndx >= nshdrs)
1213                 return (EINVAL);
1214 
1215         *shsizep = nshdrs * ehdr->e_shentsize;
1216 
1217         if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1218                 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1219                         return (ENOMEM);
1220         } else {
1221                 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1222         }
1223 
1224         if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep,
1225             (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1226             credp, &resid)) != 0) {
1227                 kmem_free(*shbasep, *shsizep);
1228                 return (err);
1229         }
1230 
1231         /*
1232          * Pull the section string table out of the vnode; fail if the size
1233          * is zero.
1234          */
1235         shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1236         if ((*shstrsizep = shdr->sh_size) == 0) {
1237                 kmem_free(*shbasep, *shsizep);
1238                 return (EINVAL);
1239         }
1240 
1241         if (*shstrsizep > elf_shstrtab_max) {
1242                 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1243                     KM_NOSLEEP)) == NULL) {
1244                         kmem_free(*shbasep, *shsizep);
1245                         return (ENOMEM);
1246                 }
1247         } else {
1248                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1249         }
1250 
1251         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1252             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1253             credp, &resid)) != 0) {
1254                 kmem_free(*shbasep, *shsizep);
1255                 kmem_free(*shstrbasep, *shstrsizep);
1256                 return (err);
1257         }
1258 
1259         /*
1260          * Make sure the strtab is null-terminated to make sure we
1261          * don't run off the end of the table.
1262          */
1263         (*shstrbasep)[*shstrsizep - 1] = '\0';
1264 
1265         return (0);
1266 }
1267 
1268 static int
1269 mapelfexec(
1270         vnode_t *vp,
1271         Ehdr *ehdr,
1272         int nphdrs,
1273         caddr_t phdrbase,
1274         Phdr **uphdr,
1275         Phdr **intphdr,
1276         Phdr **stphdr,
1277         Phdr **dtphdr,
1278         Phdr *dataphdrp,
1279         caddr_t *bssbase,
1280         caddr_t *brkbase,
1281         intptr_t *voffset,
1282         intptr_t *minaddr,
1283         size_t len,
1284         long *execsz,
1285         size_t *brksize)
1286 {
1287         Phdr *phdr;
1288         int i, prot, error;
1289         caddr_t addr = NULL;
1290         size_t zfodsz;
1291         int ptload = 0;
1292         int page;
1293         off_t offset;
1294         int hsize = ehdr->e_phentsize;
1295         caddr_t mintmp = (caddr_t)-1;
1296         extern int use_brk_lpg;
1297 
1298         if (ehdr->e_type == ET_DYN) {
1299                 secflagset_t flags = 0;
1300                 /*
1301                  * Obtain the virtual address of a hole in the
1302                  * address space to map the "interpreter".
1303                  */
1304                 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1305                         flags |= _MAP_RANDOMIZE;
1306 
1307                 map_addr(&addr, len, (offset_t)0, 1, flags);
1308                 if (addr == NULL)
1309                         return (ENOMEM);
1310                 *voffset = (intptr_t)addr;
1311 
1312                 /*
1313                  * Calculate the minimum vaddr so it can be subtracted out.
1314                  * According to the ELF specification, since PT_LOAD sections
1315                  * must be sorted by increasing p_vaddr values, this is
1316                  * guaranteed to be the first PT_LOAD section.
1317                  */
1318                 phdr = (Phdr *)phdrbase;
1319                 for (i = nphdrs; i > 0; i--) {
1320                         if (phdr->p_type == PT_LOAD) {
1321                                 *voffset -= (uintptr_t)phdr->p_vaddr;
1322                                 break;
1323                         }
1324                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1325                 }
1326 
1327         } else {
1328                 *voffset = 0;
1329         }
1330         phdr = (Phdr *)phdrbase;
1331         for (i = nphdrs; i > 0; i--) {
1332                 switch (phdr->p_type) {
1333                 case PT_LOAD:
1334                         if ((*intphdr != NULL) && (*uphdr == NULL))
1335                                 return (0);
1336 
1337                         ptload = 1;
1338                         prot = PROT_USER;
1339                         if (phdr->p_flags & PF_R)
1340                                 prot |= PROT_READ;
1341                         if (phdr->p_flags & PF_W)
1342                                 prot |= PROT_WRITE;
1343                         if (phdr->p_flags & PF_X)
1344                                 prot |= PROT_EXEC;
1345 
1346                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1347 
1348                         /*
1349                          * Keep track of the segment with the lowest starting
1350                          * address.
1351                          */
1352                         if (addr < mintmp)
1353                                 mintmp = addr;
1354 
1355                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1356 
1357                         offset = phdr->p_offset;
1358                         if (((uintptr_t)offset & PAGEOFFSET) ==
1359                             ((uintptr_t)addr & PAGEOFFSET) &&
1360                             (!(vp->v_flag & VNOMAP))) {
1361                                 page = 1;
1362                         } else {
1363                                 page = 0;
1364                         }
1365 
1366                         /*
1367                          * Set the heap pagesize for OOB when the bss size
1368                          * is known and use_brk_lpg is not 0.
1369                          */
1370                         if (brksize != NULL && use_brk_lpg &&
1371                             zfodsz != 0 && phdr == dataphdrp &&
1372                             (prot & PROT_WRITE)) {
1373                                 size_t tlen = P2NPHASE((uintptr_t)addr +
1374                                     phdr->p_filesz, PAGESIZE);
1375 
1376                                 if (zfodsz > tlen) {
1377                                         curproc->p_brkpageszc =
1378                                             page_szc(map_pgsz(MAPPGSZ_HEAP,
1379                                             curproc, addr + phdr->p_filesz +
1380                                             tlen, zfodsz - tlen, 0));
1381                                 }
1382                         }
1383 
1384                         if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1385                             (prot & PROT_WRITE)) {
1386                                 uint_t  szc = curproc->p_brkpageszc;
1387                                 size_t pgsz = page_get_pagesize(szc);
1388                                 caddr_t ebss = addr + phdr->p_memsz;
1389                                 /*
1390                                  * If we need extra space to keep the BSS an
1391                                  * integral number of pages in size, some of
1392                                  * that space may fall beyond p_brkbase, so we
1393                                  * need to set p_brksize to account for it
1394                                  * being (logically) part of the brk.
1395                                  */
1396                                 size_t extra_zfodsz;
1397 
1398                                 ASSERT(pgsz > PAGESIZE);
1399 
1400                                 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1401 
1402                                 if (error = execmap(vp, addr, phdr->p_filesz,
1403                                     zfodsz + extra_zfodsz, phdr->p_offset,
1404                                     prot, page, szc))
1405                                         goto bad;
1406                                 if (brksize != NULL)
1407                                         *brksize = extra_zfodsz;
1408                         } else {
1409                                 if (error = execmap(vp, addr, phdr->p_filesz,
1410                                     zfodsz, phdr->p_offset, prot, page, 0))
1411                                         goto bad;
1412                         }
1413 
1414                         if (bssbase != NULL && addr >= *bssbase &&
1415                             phdr == dataphdrp) {
1416                                 *bssbase = addr + phdr->p_filesz;
1417                         }
1418                         if (brkbase != NULL && addr >= *brkbase) {
1419                                 *brkbase = addr + phdr->p_memsz;
1420                         }
1421 
1422                         *execsz += btopr(phdr->p_memsz);
1423                         break;
1424 
1425                 case PT_INTERP:
1426                         if (ptload)
1427                                 goto bad;
1428                         *intphdr = phdr;
1429                         break;
1430 
1431                 case PT_SHLIB:
1432                         *stphdr = phdr;
1433                         break;
1434 
1435                 case PT_PHDR:
1436                         if (ptload)
1437                                 goto bad;
1438                         *uphdr = phdr;
1439                         break;
1440 
1441                 case PT_NULL:
1442                 case PT_DYNAMIC:
1443                 case PT_NOTE:
1444                         break;
1445 
1446                 case PT_SUNWDTRACE:
1447                         if (dtphdr != NULL)
1448                                 *dtphdr = phdr;
1449                         break;
1450 
1451                 default:
1452                         break;
1453                 }
1454                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1455         }
1456 
1457         if (minaddr != NULL) {
1458                 ASSERT(mintmp != (caddr_t)-1);
1459                 *minaddr = (intptr_t)mintmp;
1460         }
1461 
1462         if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1463                 size_t off;
1464                 uintptr_t base = (uintptr_t)*brkbase;
1465                 uintptr_t oend = base + *brksize;
1466 
1467                 ASSERT(ISP2(aslr_max_brk_skew));
1468 
1469                 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1470                 base += P2PHASE(off, aslr_max_brk_skew);
1471                 base = P2ROUNDUP(base, PAGESIZE);
1472                 *brkbase = (caddr_t)base;
1473                 /*
1474                  * Above, we set *brksize to account for the possibility we
1475                  * had to grow the 'brk' in padding out the BSS to a page
1476                  * boundary.
1477                  *
1478                  * We now need to adjust that based on where we now are
1479                  * actually putting the brk.
1480                  */
1481                 if (oend > base)
1482                         *brksize = oend - base;
1483                 else
1484                         *brksize = 0;
1485         }
1486 
1487         return (0);
1488 bad:
1489         if (error == 0)
1490                 error = EINVAL;
1491         return (error);
1492 }
1493 
1494 int
1495 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1496     rlim64_t rlimit, cred_t *credp)
1497 {
1498         Note note;
1499         int error;
1500 
1501         bzero(&note, sizeof (note));
1502         bcopy("CORE", note.name, 4);
1503         note.nhdr.n_type = type;
1504         /*
1505          * The System V ABI states that n_namesz must be the length of the
1506          * string that follows the Nhdr structure including the terminating
1507          * null. The ABI also specifies that sufficient padding should be
1508          * included so that the description that follows the name string
1509          * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1510          * respectively. However, since this change was not made correctly
1511          * at the time of the 64-bit port, both 32- and 64-bit binaries
1512          * descriptions are only guaranteed to begin on a 4-byte boundary.
1513          */
1514         note.nhdr.n_namesz = 5;
1515         note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1516 
1517         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1518             sizeof (note), rlimit, credp))
1519                 return (error);
1520 
1521         *offsetp += sizeof (note);
1522 
1523         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1524             note.nhdr.n_descsz, rlimit, credp))
1525                 return (error);
1526 
1527         *offsetp += note.nhdr.n_descsz;
1528         return (0);
1529 }
1530 
1531 /*
1532  * Copy the section data from one vnode to the section of another vnode.
1533  */
1534 static void
1535 copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
1536     void *buf, size_t size, cred_t *credp, rlim64_t rlimit)
1537 {
1538         ssize_t resid;
1539         size_t len, n = src->sh_size;
1540         offset_t off = 0;
1541 
1542         while (n != 0) {
1543                 len = MIN(size, n);
1544                 if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off,
1545                     UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1546                     resid >= len ||
1547                     core_write(dst_vp, UIO_SYSSPACE, *doffset + off,
1548                     buf, len - resid, rlimit, credp) != 0) {
1549                         dst->sh_size = 0;
1550                         dst->sh_offset = 0;
1551                         return;
1552                 }
1553 
1554                 ASSERT(n >= len - resid);
1555 
1556                 n -= len - resid;
1557                 off += len - resid;
1558         }
1559 
1560         *doffset += src->sh_size;
1561 }
1562 
/*
 * Upper bound on the size of the scratch buffer that process_scns()
 * allocates for copying section data; larger sections are copied through
 * whatever buffer is already in hand.  The variable is defined in the
 * native build of this file and only declared in the _ELF32_COMPAT build.
 */
#ifndef _ELF32_COMPAT
size_t elf_datasz_max = 1024 * 1024;
#else
extern size_t elf_datasz_max;
#endif
1568 
1569 /*
1570  * This function processes mappings that correspond to load objects to
1571  * examine their respective sections for elfcore(). It's called once with
1572  * v set to NULL to count the number of sections that we're going to need
1573  * and then again with v set to some allocated buffer that we fill in with
1574  * all the section data.
1575  */
1576 static int
1577 process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
1578     Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp)
1579 {
1580         vnode_t *lastvp = NULL;
1581         struct seg *seg;
1582         int i, j;
1583         void *data = NULL;
1584         size_t datasz = 0;
1585         shstrtab_t shstrtab;
1586         struct as *as = p->p_as;
1587         int error = 0;
1588 
1589         if (v != NULL)
1590                 shstrtab_init(&shstrtab);
1591 
1592         i = 1;
1593         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1594                 uint_t prot;
1595                 vnode_t *mvp;
1596                 void *tmp = NULL;
1597                 caddr_t saddr = seg->s_base;
1598                 caddr_t naddr;
1599                 caddr_t eaddr;
1600                 size_t segsize;
1601 
1602                 Ehdr ehdr;
1603                 int nshdrs, shstrndx, nphdrs;
1604                 caddr_t shbase;
1605                 ssize_t shsize;
1606                 char *shstrbase;
1607                 ssize_t shstrsize;
1608 
1609                 Shdr *shdr;
1610                 const char *name;
1611                 size_t sz;
1612                 uintptr_t off;
1613 
1614                 int ctf_ndx = 0;
1615                 int symtab_ndx = 0;
1616 
1617                 /*
1618                  * Since we're just looking for text segments of load
1619                  * objects, we only care about the protection bits; we don't
1620                  * care about the actual size of the segment so we use the
1621                  * reserved size. If the segment's size is zero, there's
1622                  * something fishy going on so we ignore this segment.
1623                  */
1624                 if (seg->s_ops != &segvn_ops ||
1625                     SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
1626                     mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
1627                     (segsize = pr_getsegsize(seg, 1)) == 0)
1628                         continue;
1629 
1630                 eaddr = saddr + segsize;
1631                 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
1632                 pr_getprot_done(&tmp);
1633 
1634                 /*
1635                  * Skip this segment unless the protection bits look like
1636                  * what we'd expect for a text segment.
1637                  */
1638                 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
1639                         continue;
1640 
1641                 if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx,
1642                     &nphdrs) != 0 ||
1643                     getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx,
1644                     &shbase, &shsize, &shstrbase, &shstrsize) != 0)
1645                         continue;
1646 
1647                 off = ehdr.e_shentsize;
1648                 for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) {
1649                         Shdr *symtab = NULL, *strtab;
1650 
1651                         shdr = (Shdr *)(shbase + off);
1652 
1653                         if (shdr->sh_name >= shstrsize)
1654                                 continue;
1655 
1656                         name = shstrbase + shdr->sh_name;
1657 
1658                         if (strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1659                                 if ((content & CC_CONTENT_CTF) == 0 ||
1660                                     ctf_ndx != 0)
1661                                         continue;
1662 
1663                                 if (shdr->sh_link > 0 &&
1664                                     shdr->sh_link < nshdrs) {
1665                                         symtab = (Shdr *)(shbase +
1666                                             shdr->sh_link * ehdr.e_shentsize);
1667                                 }
1668 
1669                                 if (v != NULL && i < nv - 1) {
1670                                         if (shdr->sh_size > datasz &&
1671                                             shdr->sh_size <= elf_datasz_max) {
1672                                                 if (data != NULL)
1673                                                         kmem_free(data, datasz);
1674 
1675                                                 datasz = shdr->sh_size;
1676                                                 data = kmem_alloc(datasz,
1677                                                     KM_SLEEP);
1678                                         }
1679 
1680                                         v[i].sh_name = shstrtab_ndx(&shstrtab,
1681                                             STR_CTF);
1682                                         v[i].sh_addr = (Addr)(uintptr_t)saddr;
1683                                         v[i].sh_type = SHT_PROGBITS;
1684                                         v[i].sh_addralign = 4;
1685                                         *doffsetp = roundup(*doffsetp,
1686                                             v[i].sh_addralign);
1687                                         v[i].sh_offset = *doffsetp;
1688                                         v[i].sh_size = shdr->sh_size;
1689                                         if (symtab == NULL)  {
1690                                                 v[i].sh_link = 0;
1691                                         } else if (symtab->sh_type ==
1692                                             SHT_SYMTAB &&
1693                                             symtab_ndx != 0) {
1694                                                 v[i].sh_link =
1695                                                     symtab_ndx;
1696                                         } else {
1697                                                 v[i].sh_link = i + 1;
1698                                         }
1699 
1700                                         copy_scn(shdr, mvp, &v[i], vp,
1701                                             doffsetp, data, datasz, credp,
1702                                             rlimit);
1703                                 }
1704 
1705                                 ctf_ndx = i++;
1706 
1707                                 /*
1708                                  * We've already dumped the symtab.
1709                                  */
1710                                 if (symtab != NULL &&
1711                                     symtab->sh_type == SHT_SYMTAB &&
1712                                     symtab_ndx != 0)
1713                                         continue;
1714 
1715                         } else if (strcmp(name,
1716                             shstrtab_data[STR_SYMTAB]) == 0) {
1717                                 if ((content & CC_CONTENT_SYMTAB) == 0 ||
1718                                     symtab != 0)
1719                                         continue;
1720 
1721                                 symtab = shdr;
1722                         }
1723 
1724                         if (symtab != NULL) {
1725                                 if ((symtab->sh_type != SHT_DYNSYM &&
1726                                     symtab->sh_type != SHT_SYMTAB) ||
1727                                     symtab->sh_link == 0 ||
1728                                     symtab->sh_link >= nshdrs)
1729                                         continue;
1730 
1731                                 strtab = (Shdr *)(shbase +
1732                                     symtab->sh_link * ehdr.e_shentsize);
1733 
1734                                 if (strtab->sh_type != SHT_STRTAB)
1735                                         continue;
1736 
1737                                 if (v != NULL && i < nv - 2) {
1738                                         sz = MAX(symtab->sh_size,
1739                                             strtab->sh_size);
1740                                         if (sz > datasz &&
1741                                             sz <= elf_datasz_max) {
1742                                                 if (data != NULL)
1743                                                         kmem_free(data, datasz);
1744 
1745                                                 datasz = sz;
1746                                                 data = kmem_alloc(datasz,
1747                                                     KM_SLEEP);
1748                                         }
1749 
1750                                         if (symtab->sh_type == SHT_DYNSYM) {
1751                                                 v[i].sh_name = shstrtab_ndx(
1752                                                     &shstrtab, STR_DYNSYM);
1753                                                 v[i + 1].sh_name = shstrtab_ndx(
1754                                                     &shstrtab, STR_DYNSTR);
1755                                         } else {
1756                                                 v[i].sh_name = shstrtab_ndx(
1757                                                     &shstrtab, STR_SYMTAB);
1758                                                 v[i + 1].sh_name = shstrtab_ndx(
1759                                                     &shstrtab, STR_STRTAB);
1760                                         }
1761 
1762                                         v[i].sh_type = symtab->sh_type;
1763                                         v[i].sh_addr = symtab->sh_addr;
1764                                         if (ehdr.e_type == ET_DYN ||
1765                                             v[i].sh_addr == 0)
1766                                                 v[i].sh_addr +=
1767                                                     (Addr)(uintptr_t)saddr;
1768                                         v[i].sh_addralign =
1769                                             symtab->sh_addralign;
1770                                         *doffsetp = roundup(*doffsetp,
1771                                             v[i].sh_addralign);
1772                                         v[i].sh_offset = *doffsetp;
1773                                         v[i].sh_size = symtab->sh_size;
1774                                         v[i].sh_link = i + 1;
1775                                         v[i].sh_entsize = symtab->sh_entsize;
1776                                         v[i].sh_info = symtab->sh_info;
1777 
1778                                         copy_scn(symtab, mvp, &v[i], vp,
1779                                             doffsetp, data, datasz, credp,
1780                                             rlimit);
1781 
1782                                         v[i + 1].sh_type = SHT_STRTAB;
1783                                         v[i + 1].sh_flags = SHF_STRINGS;
1784                                         v[i + 1].sh_addr = symtab->sh_addr;
1785                                         if (ehdr.e_type == ET_DYN ||
1786                                             v[i + 1].sh_addr == 0)
1787                                                 v[i + 1].sh_addr +=
1788                                                     (Addr)(uintptr_t)saddr;
1789                                         v[i + 1].sh_addralign =
1790                                             strtab->sh_addralign;
1791                                         *doffsetp = roundup(*doffsetp,
1792                                             v[i + 1].sh_addralign);
1793                                         v[i + 1].sh_offset = *doffsetp;
1794                                         v[i + 1].sh_size = strtab->sh_size;
1795 
1796                                         copy_scn(strtab, mvp, &v[i + 1], vp,
1797                                             doffsetp, data, datasz, credp,
1798                                             rlimit);
1799                                 }
1800 
1801                                 if (symtab->sh_type == SHT_SYMTAB)
1802                                         symtab_ndx = i;
1803                                 i += 2;
1804                         }
1805                 }
1806 
1807                 kmem_free(shstrbase, shstrsize);
1808                 kmem_free(shbase, shsize);
1809 
1810                 lastvp = mvp;
1811         }
1812 
1813         if (v == NULL) {
1814                 if (i == 1)
1815                         *nshdrsp = 0;
1816                 else
1817                         *nshdrsp = i + 1;
1818                 goto done;
1819         }
1820 
1821         if (i != nv - 1) {
1822                 cmn_err(CE_WARN, "elfcore: core dump failed for "
1823                     "process %d; address space is changing", p->p_pid);
1824                 error = EIO;
1825                 goto done;
1826         }
1827 
1828         v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
1829         v[i].sh_size = shstrtab_size(&shstrtab);
1830         v[i].sh_addralign = 1;
1831         *doffsetp = roundup(*doffsetp, v[i].sh_addralign);
1832         v[i].sh_offset = *doffsetp;
1833         v[i].sh_flags = SHF_STRINGS;
1834         v[i].sh_type = SHT_STRTAB;
1835 
1836         if (v[i].sh_size > datasz) {
1837                 if (data != NULL)
1838                         kmem_free(data, datasz);
1839 
1840                 datasz = v[i].sh_size;
1841                 data = kmem_alloc(datasz,
1842                     KM_SLEEP);
1843         }
1844 
1845         shstrtab_dump(&shstrtab, data);
1846 
1847         if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp,
1848             data, v[i].sh_size, rlimit, credp)) != 0)
1849                 goto done;
1850 
1851         *doffsetp += v[i].sh_size;
1852 
1853 done:
1854         if (data != NULL)
1855                 kmem_free(data, datasz);
1856 
1857         return (error);
1858 }
1859 
1860 int
1861 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
1862     core_content_t content)
1863 {
1864         offset_t poffset, soffset;
1865         Off doffset;
1866         int error, i, nphdrs, nshdrs;
1867         int overflow = 0;
1868         struct seg *seg;
1869         struct as *as = p->p_as;
1870         union {
1871                 Ehdr ehdr;
1872                 Phdr phdr[1];
1873                 Shdr shdr[1];
1874         } *bigwad;
1875         size_t bigsize;
1876         size_t phdrsz, shdrsz;
1877         Ehdr *ehdr;
1878         Phdr *v;
1879         caddr_t brkbase;
1880         size_t brksize;
1881         caddr_t stkbase;
1882         size_t stksize;
1883         int ntries = 0;
1884         klwp_t *lwp = ttolwp(curthread);
1885 
1886 top:
1887         /*
1888          * Make sure we have everything we need (registers, etc.).
1889          * All other lwps have already stopped and are in an orderly state.
1890          */
1891         ASSERT(p == ttoproc(curthread));
1892         prstop(0, 0);
1893 
1894         AS_LOCK_ENTER(as, RW_WRITER);
1895         nphdrs = prnsegs(as, 0) + 2;            /* two CORE note sections */
1896 
1897         /*
1898          * Count the number of section headers we're going to need.
1899          */
1900         nshdrs = 0;
1901         if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
1902                 (void) process_scns(content, p, credp, NULL, NULL, NULL, 0,
1903                     NULL, &nshdrs);
1904         }
1905         AS_LOCK_EXIT(as);
1906 
1907         ASSERT(nshdrs == 0 || nshdrs > 1);
1908 
1909         /*
1910          * The core file contents may required zero section headers, but if
1911          * we overflow the 16 bits allotted to the program header count in
1912          * the ELF header, we'll need that program header at index zero.
1913          */
1914         if (nshdrs == 0 && nphdrs >= PN_XNUM)
1915                 nshdrs = 1;
1916 
1917         phdrsz = nphdrs * sizeof (Phdr);
1918         shdrsz = nshdrs * sizeof (Shdr);
1919 
1920         bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz));
1921         bigwad = kmem_alloc(bigsize, KM_SLEEP);
1922 
1923         ehdr = &bigwad->ehdr;
1924         bzero(ehdr, sizeof (*ehdr));
1925 
1926         ehdr->e_ident[EI_MAG0] = ELFMAG0;
1927         ehdr->e_ident[EI_MAG1] = ELFMAG1;
1928         ehdr->e_ident[EI_MAG2] = ELFMAG2;
1929         ehdr->e_ident[EI_MAG3] = ELFMAG3;
1930         ehdr->e_ident[EI_CLASS] = ELFCLASS;
1931         ehdr->e_type = ET_CORE;
1932 
1933 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1934 
1935 #if defined(__sparc)
1936         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1937         ehdr->e_machine = EM_SPARC;
1938 #elif defined(__i386) || defined(__i386_COMPAT)
1939         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1940         ehdr->e_machine = EM_386;
1941 #else
1942 #error "no recognized machine type is defined"
1943 #endif
1944 
1945 #else   /* !defined(_LP64) || defined(_ELF32_COMPAT) */
1946 
1947 #if defined(__sparc)
1948         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1949         ehdr->e_machine = EM_SPARCV9;
1950 #elif defined(__amd64)
1951         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1952         ehdr->e_machine = EM_AMD64;
1953 #else
1954 #error "no recognized 64-bit machine type is defined"
1955 #endif
1956 
1957 #endif  /* !defined(_LP64) || defined(_ELF32_COMPAT) */
1958 
1959         /*
1960          * If the count of program headers or section headers or the index
1961          * of the section string table can't fit in the mere 16 bits
1962          * shortsightedly allotted to them in the ELF header, we use the
1963          * extended formats and put the real values in the section header
1964          * as index 0.
1965          */
1966         ehdr->e_version = EV_CURRENT;
1967         ehdr->e_ehsize = sizeof (Ehdr);
1968 
1969         if (nphdrs >= PN_XNUM)
1970                 ehdr->e_phnum = PN_XNUM;
1971         else
1972                 ehdr->e_phnum = (unsigned short)nphdrs;
1973 
1974         ehdr->e_phoff = sizeof (Ehdr);
1975         ehdr->e_phentsize = sizeof (Phdr);
1976 
1977         if (nshdrs > 0) {
1978                 if (nshdrs >= SHN_LORESERVE)
1979                         ehdr->e_shnum = 0;
1980                 else
1981                         ehdr->e_shnum = (unsigned short)nshdrs;
1982 
1983                 if (nshdrs - 1 >= SHN_LORESERVE)
1984                         ehdr->e_shstrndx = SHN_XINDEX;
1985                 else
1986                         ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
1987 
1988                 ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs;
1989                 ehdr->e_shentsize = sizeof (Shdr);
1990         }
1991 
1992         if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
1993             sizeof (Ehdr), rlimit, credp))
1994                 goto done;
1995 
1996         poffset = sizeof (Ehdr);
1997         soffset = sizeof (Ehdr) + phdrsz;
1998         doffset = sizeof (Ehdr) + phdrsz + shdrsz;
1999 
2000         v = &bigwad->phdr[0];
2001         bzero(v, phdrsz);
2002 
2003         setup_old_note_header(&v[0], p);
2004         v[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2005         doffset += v[0].p_filesz;
2006 
2007         setup_note_header(&v[1], p);
2008         v[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2009         doffset += v[1].p_filesz;
2010 
2011         mutex_enter(&p->p_lock);
2012 
2013         brkbase = p->p_brkbase;
2014         brksize = p->p_brksize;
2015 
2016         stkbase = p->p_usrstack - p->p_stksize;
2017         stksize = p->p_stksize;
2018 
2019         mutex_exit(&p->p_lock);
2020 
2021         AS_LOCK_ENTER(as, RW_WRITER);
2022         i = 2;
2023         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2024                 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2025                 caddr_t saddr, naddr;
2026                 void *tmp = NULL;
2027                 extern struct seg_ops segspt_shmops;
2028 
2029                 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2030                         uint_t prot;
2031                         size_t size;
2032                         int type;
2033                         vnode_t *mvp;
2034 
2035                         prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2036                         prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2037                         if ((size = (size_t)(naddr - saddr)) == 0)
2038                                 continue;
2039                         if (i == nphdrs) {
2040                                 overflow++;
2041                                 continue;
2042                         }
2043                         v[i].p_type = PT_LOAD;
2044                         v[i].p_vaddr = (Addr)(uintptr_t)saddr;
2045                         v[i].p_memsz = size;
2046                         if (prot & PROT_READ)
2047                                 v[i].p_flags |= PF_R;
2048                         if (prot & PROT_WRITE)
2049                                 v[i].p_flags |= PF_W;
2050                         if (prot & PROT_EXEC)
2051                                 v[i].p_flags |= PF_X;
2052 
2053                         /*
2054                          * Figure out which mappings to include in the core.
2055                          */
2056                         type = SEGOP_GETTYPE(seg, saddr);
2057 
2058                         if (saddr == stkbase && size == stksize) {
2059                                 if (!(content & CC_CONTENT_STACK))
2060                                         goto exclude;
2061 
2062                         } else if (saddr == brkbase && size == brksize) {
2063                                 if (!(content & CC_CONTENT_HEAP))
2064                                         goto exclude;
2065 
2066                         } else if (seg->s_ops == &segspt_shmops) {
2067                                 if (type & MAP_NORESERVE) {
2068                                         if (!(content & CC_CONTENT_DISM))
2069                                                 goto exclude;
2070                                 } else {
2071                                         if (!(content & CC_CONTENT_ISM))
2072                                                 goto exclude;
2073                                 }
2074 
2075                         } else if (seg->s_ops != &segvn_ops) {
2076                                 goto exclude;
2077 
2078                         } else if (type & MAP_SHARED) {
2079                                 if (shmgetid(p, saddr) != SHMID_NONE) {
2080                                         if (!(content & CC_CONTENT_SHM))
2081                                                 goto exclude;
2082 
2083                                 } else if (SEGOP_GETVP(seg, seg->s_base,
2084                                     &mvp) != 0 || mvp == NULL ||
2085                                     mvp->v_type != VREG) {
2086                                         if (!(content & CC_CONTENT_SHANON))
2087                                                 goto exclude;
2088 
2089                                 } else {
2090                                         if (!(content & CC_CONTENT_SHFILE))
2091                                                 goto exclude;
2092                                 }
2093 
2094                         } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2095                             mvp == NULL || mvp->v_type != VREG) {
2096                                 if (!(content & CC_CONTENT_ANON))
2097                                         goto exclude;
2098 
2099                         } else if (prot == (PROT_READ | PROT_EXEC)) {
2100                                 if (!(content & CC_CONTENT_TEXT))
2101                                         goto exclude;
2102 
2103                         } else if (prot == PROT_READ) {
2104                                 if (!(content & CC_CONTENT_RODATA))
2105                                         goto exclude;
2106 
2107                         } else {
2108                                 if (!(content & CC_CONTENT_DATA))
2109                                         goto exclude;
2110                         }
2111 
2112                         doffset = roundup(doffset, sizeof (Word));
2113                         v[i].p_offset = doffset;
2114                         v[i].p_filesz = size;
2115                         doffset += size;
2116 exclude:
2117                         i++;
2118                 }
2119                 ASSERT(tmp == NULL);
2120         }
2121         AS_LOCK_EXIT(as);
2122 
2123         if (overflow || i != nphdrs) {
2124                 if (ntries++ == 0) {
2125                         kmem_free(bigwad, bigsize);
2126                         overflow = 0;
2127                         goto top;
2128                 }
2129                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2130                     "process %d; address space is changing", p->p_pid);
2131                 error = EIO;
2132                 goto done;
2133         }
2134 
2135         if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2136             v, phdrsz, rlimit, credp)) != 0)
2137                 goto done;
2138 
2139         if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit,
2140             credp)) != 0)
2141                 goto done;
2142 
2143         if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit,
2144             credp, content)) != 0)
2145                 goto done;
2146 
2147         for (i = 2; i < nphdrs; i++) {
2148                 prkillinfo_t killinfo;
2149                 sigqueue_t *sq;
2150                 int sig, j;
2151 
2152                 if (v[i].p_filesz == 0)
2153                         continue;
2154 
2155                 /*
2156                  * If dumping out this segment fails, rather than failing
2157                  * the core dump entirely, we reset the size of the mapping
2158                  * to zero to indicate that the data is absent from the core
2159                  * file and or in the PF_SUNW_FAILURE flag to differentiate
2160                  * this from mappings that were excluded due to the core file
2161                  * content settings.
2162                  */
2163                 if ((error = core_seg(p, vp, v[i].p_offset,
2164                     (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz,
2165                     rlimit, credp)) == 0) {
2166                         continue;
2167                 }
2168 
2169                 if ((sig = lwp->lwp_cursig) == 0) {
2170                         /*
2171                          * We failed due to something other than a signal.
2172                          * Since the space reserved for the segment is now
2173                          * unused, we stash the errno in the first four
2174                          * bytes. This undocumented interface will let us
2175                          * understand the nature of the failure.
2176                          */
2177                         (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2178                             &error, sizeof (error), rlimit, credp);
2179 
2180                         v[i].p_filesz = 0;
2181                         v[i].p_flags |= PF_SUNW_FAILURE;
2182                         if ((error = core_write(vp, UIO_SYSSPACE,
2183                             poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]),
2184                             rlimit, credp)) != 0)
2185                                 goto done;
2186 
2187                         continue;
2188                 }
2189 
2190                 /*
2191                  * We took a signal.  We want to abort the dump entirely, but
2192                  * we also want to indicate what failed and why.  We therefore
2193                  * use the space reserved for the first failing segment to
2194                  * write our error (which, for purposes of compatability with
2195                  * older core dump readers, we set to EINTR) followed by any
2196                  * siginfo associated with the signal.
2197                  */
2198                 bzero(&killinfo, sizeof (killinfo));
2199                 killinfo.prk_error = EINTR;
2200 
2201                 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2202 
2203                 if (sq != NULL) {
2204                         bcopy(&sq->sq_info, &killinfo.prk_info,
2205                             sizeof (sq->sq_info));
2206                 } else {
2207                         killinfo.prk_info.si_signo = lwp->lwp_cursig;
2208                         killinfo.prk_info.si_code = SI_NOINFO;
2209                 }
2210 
2211 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2212                 /*
2213                  * If this is a 32-bit process, we need to translate from the
2214                  * native siginfo to the 32-bit variant.  (Core readers must
2215                  * always have the same data model as their target or must
2216                  * be aware of -- and compensate for -- data model differences.)
2217                  */
2218                 if (curproc->p_model == DATAMODEL_ILP32) {
2219                         siginfo32_t si32;
2220 
2221                         siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2222                         bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2223                 }
2224 #endif
2225 
2226                 (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2227                     &killinfo, sizeof (killinfo), rlimit, credp);
2228 
2229                 /*
2230                  * For the segment on which we took the signal, indicate that
2231                  * its data now refers to a siginfo.
2232                  */
2233                 v[i].p_filesz = 0;
2234                 v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2235                     PF_SUNW_SIGINFO;
2236 
2237                 /*
2238                  * And for every other segment, indicate that its absence
2239                  * is due to a signal.
2240                  */
2241                 for (j = i + 1; j < nphdrs; j++) {
2242                         v[j].p_filesz = 0;
2243                         v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2244                 }
2245 
2246                 /*
2247                  * Finally, write out our modified program headers.
2248                  */
2249                 if ((error = core_write(vp, UIO_SYSSPACE,
2250                     poffset + sizeof (v[i]) * i, &v[i],
2251                     sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0)
2252                         goto done;
2253 
2254                 break;
2255         }
2256 
2257         if (nshdrs > 0) {
2258                 bzero(&bigwad->shdr[0], shdrsz);
2259 
2260                 if (nshdrs >= SHN_LORESERVE)
2261                         bigwad->shdr[0].sh_size = nshdrs;
2262 
2263                 if (nshdrs - 1 >= SHN_LORESERVE)
2264                         bigwad->shdr[0].sh_link = nshdrs - 1;
2265 
2266                 if (nphdrs >= PN_XNUM)
2267                         bigwad->shdr[0].sh_info = nphdrs;
2268 
2269                 if (nshdrs > 1) {
2270                         AS_LOCK_ENTER(as, RW_WRITER);
2271                         if ((error = process_scns(content, p, credp, vp,
2272                             &bigwad->shdr[0], nshdrs, rlimit, &doffset,
2273                             NULL)) != 0) {
2274                                 AS_LOCK_EXIT(as);
2275                                 goto done;
2276                         }
2277                         AS_LOCK_EXIT(as);
2278                 }
2279 
2280                 if ((error = core_write(vp, UIO_SYSSPACE, soffset,
2281                     &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0)
2282                         goto done;
2283         }
2284 
2285 done:
2286         kmem_free(bigwad, bigsize);
2287         return (error);
2288 }
2289 
2290 #ifndef _ELF32_COMPAT
2291 
2292 static struct execsw esw = {
2293 #ifdef  _LP64
2294         elf64magicstr,
2295 #else   /* _LP64 */
2296         elf32magicstr,
2297 #endif  /* _LP64 */
2298         0,
2299         5,
2300         elfexec,
2301         elfcore
2302 };
2303 
2304 static struct modlexec modlexec = {
2305         &mod_execops, "exec module for elf", &esw
2306 };
2307 
2308 #ifdef  _LP64
2309 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2310                         intpdata_t *idatap, int level, long *execsz,
2311                         int setid, caddr_t exec_file, cred_t *cred,
2312                         int brand_action);
2313 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2314                         rlim64_t rlimit, int sig, core_content_t content);
2315 
2316 static struct execsw esw32 = {
2317         elf32magicstr,
2318         0,
2319         5,
2320         elf32exec,
2321         elf32core
2322 };
2323 
2324 static struct modlexec modlexec32 = {
2325         &mod_execops, "32-bit exec module for elf", &esw32
2326 };
2327 #endif  /* _LP64 */
2328 
2329 static struct modlinkage modlinkage = {
2330         MODREV_1,
2331         (void *)&modlexec,
2332 #ifdef  _LP64
2333         (void *)&modlexec32,
2334 #endif  /* _LP64 */
2335         NULL
2336 };
2337 
2338 int
2339 _init(void)
2340 {
2341         return (mod_install(&modlinkage));
2342 }
2343 
2344 int
2345 _fini(void)
2346 {
2347         return (mod_remove(&modlinkage));
2348 }
2349 
2350 int
2351 _info(struct modinfo *modinfop)
2352 {
2353         return (mod_info(&modlinkage, modinfop));
2354 }
2355 
2356 #endif  /* !_ELF32_COMPAT */