1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 /*
  29  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/thread.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred.h>
  38 #include <sys/user.h>
  39 #include <sys/errno.h>
  40 #include <sys/vnode.h>
  41 #include <sys/mman.h>
  42 #include <sys/kmem.h>
  43 #include <sys/proc.h>
  44 #include <sys/pathname.h>
  45 #include <sys/policy.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/systm.h>
  48 #include <sys/elf.h>
  49 #include <sys/vmsystm.h>
  50 #include <sys/debug.h>
  51 #include <sys/auxv.h>
  52 #include <sys/exec.h>
  53 #include <sys/prsystm.h>
  54 #include <vm/as.h>
  55 #include <vm/rm.h>
  56 #include <vm/seg.h>
  57 #include <vm/seg_vn.h>
  58 #include <sys/modctl.h>
  59 #include <sys/systeminfo.h>
  60 #include <sys/vmparam.h>
  61 #include <sys/machelf.h>
  62 #include <sys/shm_impl.h>
  63 #include <sys/archsystm.h>
  64 #include <sys/fasttrap.h>
  65 #include <sys/brand.h>
  66 #include "elf_impl.h"
  67 #include <sys/sdt.h>
  68 #include <sys/siginfo.h>
  69 #include <sys/random.h>
  70 
/*
 * Globals defined outside this file.
 *
 * at_flags is the value elfexec() publishes to the new process in the
 * AT_FLAGS aux vector entry.
 *
 * NOTE(review): aslr_max_brk_skew is not referenced in the part of the file
 * reviewed here; presumably it bounds the random skew applied to the brk
 * base when ASLR is in effect -- confirm against its definition.
 */
extern int at_flags;
extern volatile size_t aslr_max_brk_skew;
  73 
  74 #define ORIGIN_STR      "ORIGIN"
  75 #define ORIGIN_STR_SIZE 6
  76 
/*
 * Forward declarations for ELF helpers defined later in this file.  As used
 * by the callers in this file:
 *
 *	getelfhead()	fills in the caller's Ehdr from the vnode and hands
 *			back the section header count, the section name
 *			string table index and the program header count;
 *			a non-zero return is treated as an errno.
 *	getelfphdr()	hands back a buffer holding the program headers and
 *			its size; callers release it with kmem_free().
 *	elfsize()	returns the size needed for the object's loadable
 *			segments; callers treat 0 as "nothing to load".
 *	mapelfexec()	maps the object and reports the program headers of
 *			interest through its Phdr ** arguments; a non-zero
 *			return is treated as an errno.
 *
 * NOTE(review): getelfshdr() is not called in the part of the file reviewed
 * here; presumably it is the section header counterpart of getelfphdr() --
 * confirm against its definition.
 */
static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *);
static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *,
    ssize_t *);
static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *,
    ssize_t *, caddr_t *, ssize_t *);
static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *);
static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t,
    Phdr **, Phdr **, Phdr **, Phdr **, Phdr *,
    caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *);
  86 
  87 typedef enum {
  88         STR_CTF,
  89         STR_SYMTAB,
  90         STR_DYNSYM,
  91         STR_STRTAB,
  92         STR_DYNSTR,
  93         STR_SHSTRTAB,
  94         STR_NUM
  95 } shstrtype_t;
  96 
  97 static const char *shstrtab_data[] = {
  98         ".SUNW_ctf",
  99         ".symtab",
 100         ".dynsym",
 101         ".strtab",
 102         ".dynstr",
 103         ".shstrtab"
 104 };
 105 
 106 typedef struct shstrtab {
 107         int     sst_ndx[STR_NUM];
 108         int     sst_cur;
 109 } shstrtab_t;
 110 
 111 static void
 112 shstrtab_init(shstrtab_t *s)
 113 {
 114         bzero(&s->sst_ndx, sizeof (s->sst_ndx));
 115         s->sst_cur = 1;
 116 }
 117 
 118 static int
 119 shstrtab_ndx(shstrtab_t *s, shstrtype_t type)
 120 {
 121         int ret;
 122 
 123         if ((ret = s->sst_ndx[type]) != 0)
 124                 return (ret);
 125 
 126         ret = s->sst_ndx[type] = s->sst_cur;
 127         s->sst_cur += strlen(shstrtab_data[type]) + 1;
 128 
 129         return (ret);
 130 }
 131 
 132 static size_t
 133 shstrtab_size(const shstrtab_t *s)
 134 {
 135         return (s->sst_cur);
 136 }
 137 
 138 static void
 139 shstrtab_dump(const shstrtab_t *s, char *buf)
 140 {
 141         int i, ndx;
 142 
 143         *buf = '\0';
 144         for (i = 0; i < STR_NUM; i++) {
 145                 if ((ndx = s->sst_ndx[i]) != 0)
 146                         (void) strcpy(buf + ndx, shstrtab_data[i]);
 147         }
 148 }
 149 
 150 static int
 151 dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 152 {
 153         ASSERT(phdrp->p_type == PT_SUNWDTRACE);
 154 
 155         /*
 156          * See the comment in fasttrap.h for information on how to safely
 157          * update this program header.
 158          */
 159         if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
 160             (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
 161                 return (-1);
 162 
 163         args->thrptr = phdrp->p_vaddr + base;
 164 
 165         return (0);
 166 }
 167 
 168 static int
 169 handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
 170 {
 171         uint_t flag;
 172 
 173         switch (dt) {
 174         case DT_SUNW_ASLR:
 175                 flag = PROC_SEC_ASLR;
 176                 break;
 177         default:
 178                 return (EINVAL);
 179         }
 180 
 181         if (val == 0) {
 182                 if (secflag_isset(p->p_secflags.psf_lower, flag))
 183                         return (EPERM);
 184                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 185                     secflag_isset(p->p_secflags.psf_inherit, flag))
 186                         return (EPERM);
 187 
 188                 secflag_clear(&p->p_secflags.psf_inherit, flag);
 189                 secflag_clear(&p->p_secflags.psf_effective, flag);
 190         } else {
 191                 if (!secflag_isset(p->p_secflags.psf_upper, flag))
 192                         return (EPERM);
 193 
 194                 if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
 195                     !secflag_isset(p->p_secflags.psf_inherit, flag))
 196                         return (EPERM);
 197 
 198                 secflag_set(&p->p_secflags.psf_inherit, flag);
 199                 secflag_set(&p->p_secflags.psf_effective, flag);
 200         }
 201 
 202         return (0);
 203 }
 204 
 205 /*
 206  * Map in the executable pointed to by vp. Returns 0 on success.
 207  */
 208 int
 209 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 210     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
 211     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
 212 {
 213         size_t          len;
 214         struct vattr    vat;
 215         caddr_t         phdrbase = NULL;
 216         ssize_t         phdrsize;
 217         int             nshdrs, shstrndx, nphdrs;
 218         int             error = 0;
 219         Phdr            *uphdr = NULL;
 220         Phdr            *junk = NULL;
 221         Phdr            *dynphdr = NULL;
 222         Phdr            *dtrphdr = NULL;
 223         uintptr_t       lddata;
 224         long            execsz;
 225         intptr_t        minaddr;
 226 
 227         if (lddatap != NULL)
 228                 *lddatap = NULL;
 229 
 230         if (error = execpermissions(vp, &vat, args)) {
 231                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 232                 return (error);
 233         }
 234 
 235         if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
 236             &nphdrs)) != 0 ||
 237             (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
 238             &phdrsize)) != 0) {
 239                 uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
 240                 return (error);
 241         }
 242 
 243         if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
 244                 uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
 245                 kmem_free(phdrbase, phdrsize);
 246                 return (ENOEXEC);
 247         }
 248         if (lddatap != NULL)
 249                 *lddatap = lddata;
 250 
 251         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
 252             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 253             len, &execsz, brksize)) {
 254                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
 255                 kmem_free(phdrbase, phdrsize);
 256                 return (error);
 257         }
 258 
 259         /*
 260          * Inform our caller if the executable needs an interpreter.
 261          */
 262         *interp = (dynphdr == NULL) ? 0 : 1;
 263 
 264         /*
 265          * If this is a statically linked executable, voffset should indicate
 266          * the address of the executable itself (it normally holds the address
 267          * of the interpreter).
 268          */
 269         if (ehdr->e_type == ET_EXEC && *interp == 0)
 270                 *voffset = minaddr;
 271 
 272         if (uphdr != NULL) {
 273                 *uphdr_vaddr = uphdr->p_vaddr;
 274         } else {
 275                 *uphdr_vaddr = (Addr)-1;
 276         }
 277 
 278         kmem_free(phdrbase, phdrsize);
 279         return (error);
 280 }
 281 
 282 /*ARGSUSED*/
 283 int
 284 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 285     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
 286     int brand_action)
 287 {
 288         caddr_t         phdrbase = NULL;
 289         caddr_t         bssbase = 0;
 290         caddr_t         brkbase = 0;
 291         size_t          brksize = 0;
 292         ssize_t         dlnsize;
 293         aux_entry_t     *aux;
 294         int             error;
 295         ssize_t         resid;
 296         int             fd = -1;
 297         intptr_t        voffset;
 298         Phdr            *intphdr = NULL;
 299         Phdr            *dynamicphdr = NULL;
 300         Phdr            *stphdr = NULL;
 301         Phdr            *uphdr = NULL;
 302         Phdr            *junk = NULL;
 303         size_t          len;
 304         ssize_t         phdrsize;
 305         int             postfixsize = 0;
 306         int             i, hsize;
 307         Phdr            *phdrp;
 308         Phdr            *dataphdrp = NULL;
 309         Phdr            *dtrphdr;
 310         Phdr            *capphdr = NULL;
 311         Cap             *cap = NULL;
 312         ssize_t         capsize;
 313         Dyn             *dyn = NULL;
 314         int             hasu = 0;
 315         int             hasauxv = 0;
 316         int             hasintp = 0;
 317         int             branded = 0;
 318 
 319         struct proc *p = ttoproc(curthread);
 320         struct user *up = PTOU(p);
 321         struct bigwad {
 322                 Ehdr    ehdr;
 323                 aux_entry_t     elfargs[__KERN_NAUXV_IMPL];
 324                 char            dl_name[MAXPATHLEN];
 325                 char            pathbuf[MAXPATHLEN];
 326                 struct vattr    vattr;
 327                 struct execenv  exenv;
 328         } *bigwad;      /* kmem_alloc this behemoth so we don't blow stack */
 329         Ehdr            *ehdrp;
 330         int             nshdrs, shstrndx, nphdrs;
 331         char            *dlnp;
 332         char            *pathbufp;
 333         rlim64_t        limit;
 334         rlim64_t        roundlimit;
 335 
 336         ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
 337 
 338         bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
 339         ehdrp = &bigwad->ehdr;
 340         dlnp = bigwad->dl_name;
 341         pathbufp = bigwad->pathbuf;
 342 
 343         /*
 344          * Obtain ELF and program header information.
 345          */
 346         if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
 347             &nphdrs)) != 0 ||
 348             (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
 349             &phdrsize)) != 0)
 350                 goto out;
 351 
 352         /*
 353          * Prevent executing an ELF file that has no entry point.
 354          */
 355         if (ehdrp->e_entry == 0) {
 356                 uprintf("%s: Bad entry point\n", exec_file);
 357                 goto bad;
 358         }
 359 
 360         /*
 361          * Put data model that we're exec-ing to into the args passed to
 362          * exec_args(), so it will know what it is copying to on new stack.
 363          * Now that we know whether we are exec-ing a 32-bit or 64-bit
 364          * executable, we can set execsz with the appropriate NCARGS.
 365          */
 366 #ifdef  _LP64
 367         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
 368                 args->to_model = DATAMODEL_ILP32;
 369                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 370         } else {
 371                 args->to_model = DATAMODEL_LP64;
 372                 args->stk_prot &= ~PROT_EXEC;
 373 #if defined(__i386) || defined(__amd64)
 374                 args->dat_prot &= ~PROT_EXEC;
 375 #endif
 376                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
 377         }
 378 #else   /* _LP64 */
 379         args->to_model = DATAMODEL_ILP32;
 380         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 381 #endif  /* _LP64 */
 382 
 383         /*
 384          * We delay invoking the brand callback until we've figured out
 385          * what kind of elf binary we're trying to run, 32-bit or 64-bit.
 386          * We do this because now the brand library can just check
 387          * args->to_model to see if the target is 32-bit or 64-bit without
	 * having to duplicate all the code above.
 389          *
 390          * The level checks associated with brand handling below are used to
 391          * prevent a loop since the brand elfexec function typically comes back
 392          * through this function. We must check <= here since the nested
 393          * handling in the #! interpreter code will increment the level before
 394          * calling gexec to run the final elfexec interpreter.
 395          */
 396         if ((level <= INTP_MAXDEPTH) &&
 397             (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 398                 error = BROP(p)->b_elfexec(vp, uap, args,
 399                     idatap, level + 1, execsz, setid, exec_file, cred,
 400                     brand_action);
 401                 goto out;
 402         }
 403 
 404         /*
 405          * Determine aux size now so that stack can be built
 406          * in one shot (except actual copyout of aux image),
 407          * determine any non-default stack protections,
 408          * and still have this code be machine independent.
 409          */
 410         hsize = ehdrp->e_phentsize;
 411         phdrp = (Phdr *)phdrbase;
 412         for (i = nphdrs; i > 0; i--) {
 413                 switch (phdrp->p_type) {
 414                 case PT_INTERP:
 415                         hasauxv = hasintp = 1;
 416                         break;
 417                 case PT_PHDR:
 418                         hasu = 1;
 419                         break;
 420                 case PT_SUNWSTACK:
 421                         args->stk_prot = PROT_USER;
 422                         if (phdrp->p_flags & PF_R)
 423                                 args->stk_prot |= PROT_READ;
 424                         if (phdrp->p_flags & PF_W)
 425                                 args->stk_prot |= PROT_WRITE;
 426                         if (phdrp->p_flags & PF_X)
 427                                 args->stk_prot |= PROT_EXEC;
 428                         break;
 429                 case PT_LOAD:
 430                         dataphdrp = phdrp;
 431                         break;
 432                 case PT_SUNWCAP:
 433                         capphdr = phdrp;
 434                         break;
 435                 case PT_DYNAMIC:
 436                         dynamicphdr = phdrp;
 437                         break;
 438                 }
 439                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
 440         }
 441 
 442         if (ehdrp->e_type != ET_EXEC) {
 443                 dataphdrp = NULL;
 444                 hasauxv = 1;
 445         }
 446 
 447         /* Copy BSS permissions to args->dat_prot */
 448         if (dataphdrp != NULL) {
 449                 args->dat_prot = PROT_USER;
 450                 if (dataphdrp->p_flags & PF_R)
 451                         args->dat_prot |= PROT_READ;
 452                 if (dataphdrp->p_flags & PF_W)
 453                         args->dat_prot |= PROT_WRITE;
 454                 if (dataphdrp->p_flags & PF_X)
 455                         args->dat_prot |= PROT_EXEC;
 456         }
 457 
 458         /*
	 * If an auxvector will be required - reserve the space for
 460          * it now.  This may be increased by exec_args if there are
 461          * ISA-specific types (included in __KERN_NAUXV_IMPL).
 462          */
 463         if (hasauxv) {
 464                 /*
		 * If an AUX vector is being built - the base AUX
 466                  * entries are:
 467                  *
 468                  *      AT_BASE
 469                  *      AT_FLAGS
 470                  *      AT_PAGESZ
 471                  *      AT_SUN_AUXFLAGS
 472                  *      AT_SUN_HWCAP
 473                  *      AT_SUN_HWCAP2
 474                  *      AT_SUN_PLATFORM (added in stk_copyout)
 475                  *      AT_SUN_EXECNAME (added in stk_copyout)
 476                  *      AT_NULL
 477                  *
 478                  * total == 9
 479                  */
 480                 if (hasintp && hasu) {
 481                         /*
 482                          * Has PT_INTERP & PT_PHDR - the auxvectors that
 483                          * will be built are:
 484                          *
 485                          *      AT_PHDR
 486                          *      AT_PHENT
 487                          *      AT_PHNUM
 488                          *      AT_ENTRY
 489                          *      AT_LDDATA
 490                          *
 491                          * total = 5
 492                          */
 493                         args->auxsize = (9 + 5) * sizeof (aux_entry_t);
 494                 } else if (hasintp) {
 495                         /*
 496                          * Has PT_INTERP but no PT_PHDR
 497                          *
 498                          *      AT_EXECFD
 499                          *      AT_LDDATA
 500                          *
 501                          * total = 2
 502                          */
 503                         args->auxsize = (9 + 2) * sizeof (aux_entry_t);
 504                 } else {
 505                         args->auxsize = 9 * sizeof (aux_entry_t);
 506                 }
 507         } else {
 508                 args->auxsize = 0;
 509         }
 510 
 511         /*
 512          * If this binary is using an emulator, we need to add an
 513          * AT_SUN_EMULATOR aux entry.
 514          */
 515         if (args->emulator != NULL)
 516                 args->auxsize += sizeof (aux_entry_t);
 517 
 518         if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 519                 branded = 1;
 520                 /*
 521                  * We will be adding 4 entries to the aux vectors.  One for
		 * the brandname and 3 for the brand specific aux vectors.
 523                  */
 524                 args->auxsize += 4 * sizeof (aux_entry_t);
 525         }
 526 
 527         /* If the binary has an explicit ASLR flag, it must be honoured */
 528         if ((dynamicphdr != NULL) &&
 529             (dynamicphdr->p_filesz > 0)) {
 530                 Dyn *dp;
 531                 off_t i = 0;
 532 
 533 #define DYN_STRIDE      100
 534                 for (i = 0; i < dynamicphdr->p_filesz;
 535                     i += sizeof (*dyn) * DYN_STRIDE) {
 536                         int ndyns = (dynamicphdr->p_filesz - i) / sizeof (*dyn);
 537                         size_t dynsize;
 538 
 539                         ndyns = MIN(DYN_STRIDE, ndyns);
 540                         dynsize = ndyns * sizeof (*dyn);
 541 
 542                         dyn = kmem_alloc(dynsize, KM_SLEEP);
 543 
 544                         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
 545                             dynsize, (offset_t)(dynamicphdr->p_offset + i),
 546                             UIO_SYSSPACE, 0, (rlim64_t)0,
 547                             CRED(), &resid)) != 0) {
 548                                 uprintf("%s: cannot read .dynamic section\n",
 549                                     exec_file);
 550                                 goto out;
 551                         }
 552 
 553                         for (dp = dyn; dp < (dyn + ndyns); dp++) {
 554                                 if (dp->d_tag == DT_SUNW_ASLR) {
 555                                         if ((error = handle_secflag_dt(p,
 556                                             DT_SUNW_ASLR,
 557                                             dp->d_un.d_val)) != 0) {
 558                                                 uprintf("%s: error setting "
 559                                                     "security-flag from "
 560                                                     "DT_SUNW_ASLR: %d\n",
 561                                                     exec_file, error);
 562                                                 goto out;
 563                                         }
 564                                 }
 565                         }
 566 
 567                         kmem_free(dyn, dynsize);
 568                 }
 569         }
 570 
 571         /* Hardware/Software capabilities */
 572         if (capphdr != NULL &&
 573             (capsize = capphdr->p_filesz) > 0 &&
 574             capsize <= 16 * sizeof (*cap)) {
 575                 int ncaps = capsize / sizeof (*cap);
 576                 Cap *cp;
 577 
 578                 cap = kmem_alloc(capsize, KM_SLEEP);
 579                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
 580                     capsize, (offset_t)capphdr->p_offset,
 581                     UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
 582                         uprintf("%s: Cannot read capabilities section\n",
 583                             exec_file);
 584                         goto out;
 585                 }
 586                 for (cp = cap; cp < cap + ncaps; cp++) {
 587                         if (cp->c_tag == CA_SUNW_SF_1 &&
 588                             (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
 589                                 if (args->to_model == DATAMODEL_LP64)
 590                                         args->addr32 = 1;
 591                                 break;
 592                         }
 593                 }
 594         }
 595 
 596         aux = bigwad->elfargs;
 597         /*
 598          * Move args to the user's stack.
 599          * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
 600          */
 601         if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 602                 if (error == -1) {
 603                         error = ENOEXEC;
 604                         goto bad;
 605                 }
 606                 goto out;
 607         }
 608         /* we're single threaded after this point */
 609 
 610         /*
 611          * If this is an ET_DYN executable (shared object),
 612          * determine its memory size so that mapelfexec() can load it.
 613          */
 614         if (ehdrp->e_type == ET_DYN)
 615                 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
 616         else
 617                 len = 0;
 618 
 619         dtrphdr = NULL;
 620 
 621         if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
 622             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
 623             len, execsz, &brksize)) != 0)
 624                 goto bad;
 625 
 626         if (uphdr != NULL && intphdr == NULL)
 627                 goto bad;
 628 
 629         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 630                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
 631                 goto bad;
 632         }
 633 
 634         if (intphdr != NULL) {
 635                 size_t          len;
 636                 uintptr_t       lddata;
 637                 char            *p;
 638                 struct vnode    *nvp;
 639 
 640                 dlnsize = intphdr->p_filesz;
 641 
 642                 if (dlnsize > MAXPATHLEN || dlnsize <= 0)
 643                         goto bad;
 644 
 645                 /*
 646                  * Read in "interpreter" pathname.
 647                  */
 648                 if ((error = vn_rdwr(UIO_READ, vp, dlnp, intphdr->p_filesz,
 649                     (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
 650                     CRED(), &resid)) != 0) {
 651                         uprintf("%s: Cannot obtain interpreter pathname\n",
 652                             exec_file);
 653                         goto bad;
 654                 }
 655 
 656                 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
 657                         goto bad;
 658 
 659                 /*
 660                  * Search for '$ORIGIN' token in interpreter path.
 661                  * If found, expand it.
 662                  */
 663                 for (p = dlnp; p = strchr(p, '$'); ) {
 664                         uint_t  len, curlen;
 665                         char    *_ptr;
 666 
 667                         if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
 668                                 continue;
 669 
 670                         /*
 671                          * We don't support $ORIGIN on setid programs to close
 672                          * a potential attack vector.
 673                          */
 674                         if ((setid & EXECSETID_SETID) != 0) {
 675                                 error = ENOEXEC;
 676                                 goto bad;
 677                         }
 678 
 679                         curlen = 0;
 680                         len = p - dlnp - 1;
 681                         if (len) {
 682                                 bcopy(dlnp, pathbufp, len);
 683                                 curlen += len;
 684                         }
 685                         if (_ptr = strrchr(args->pathname, '/')) {
 686                                 len = _ptr - args->pathname;
 687                                 if ((curlen + len) > MAXPATHLEN)
 688                                         break;
 689 
 690                                 bcopy(args->pathname, &pathbufp[curlen], len);
 691                                 curlen += len;
 692                         } else {
 693                                 /*
 694                                  * executable is a basename found in the
				 * current directory.  So - just substitute
 696                                  * '.' for ORIGIN.
 697                                  */
 698                                 pathbufp[curlen] = '.';
 699                                 curlen++;
 700                         }
 701                         p += ORIGIN_STR_SIZE;
 702                         len = strlen(p);
 703 
 704                         if ((curlen + len) > MAXPATHLEN)
 705                                 break;
 706                         bcopy(p, &pathbufp[curlen], len);
 707                         curlen += len;
 708                         pathbufp[curlen++] = '\0';
 709                         bcopy(pathbufp, dlnp, curlen);
 710                 }
 711 
 712                 /*
 713                  * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
 714                  * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
 715                  * Just in case /usr is not mounted, change it now.
 716                  */
 717                 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
 718                         dlnp += 4;
 719                 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
 720                 if (error && dlnp != bigwad->dl_name) {
 721                         /* new kernel, old user-level */
 722                         error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
 723                             NULLVPP, &nvp);
 724                 }
 725                 if (error) {
 726                         uprintf("%s: Cannot find %s\n", exec_file, dlnp);
 727                         goto bad;
 728                 }
 729 
 730                 /*
 731                  * Setup the "aux" vector.
 732                  */
 733                 if (uphdr) {
 734                         if (ehdrp->e_type == ET_DYN) {
 735                                 /* don't use the first page */
 736                                 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
 737                                 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
 738                         } else {
 739                                 bigwad->exenv.ex_bssbase = bssbase;
 740                                 bigwad->exenv.ex_brkbase = brkbase;
 741                         }
 742                         bigwad->exenv.ex_brksize = brksize;
 743                         bigwad->exenv.ex_magic = elfmagic;
 744                         bigwad->exenv.ex_vp = vp;
 745                         setexecenv(&bigwad->exenv);
 746 
 747                         ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
 748                         ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
 749                         ADDAUX(aux, AT_PHNUM, nphdrs)
 750                         ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
 751                 } else {
 752                         if ((error = execopen(&vp, &fd)) != 0) {
 753                                 VN_RELE(nvp);
 754                                 goto bad;
 755                         }
 756 
 757                         ADDAUX(aux, AT_EXECFD, fd)
 758                 }
 759 
 760                 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
 761                         VN_RELE(nvp);
 762                         uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
 763                         goto bad;
 764                 }
 765 
 766                 /*
 767                  * Now obtain the ELF header along with the entire program
 768                  * header contained in "nvp".
 769                  */
 770                 kmem_free(phdrbase, phdrsize);
 771                 phdrbase = NULL;
 772                 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
 773                     &shstrndx, &nphdrs)) != 0 ||
 774                     (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
 775                     &phdrsize)) != 0) {
 776                         VN_RELE(nvp);
 777                         uprintf("%s: Cannot read %s\n", exec_file, dlnp);
 778                         goto bad;
 779                 }
 780 
 781                 /*
 782                  * Determine memory size of the "interpreter's" loadable
 783                  * sections.  This size is then used to obtain the virtual
 784                  * address of a hole, in the user's address space, large
 785                  * enough to map the "interpreter".
 786                  */
 787                 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
 788                         VN_RELE(nvp);
 789                         uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
 790                         goto bad;
 791                 }
 792 
 793                 dtrphdr = NULL;
 794 
 795                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
 796                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 797                     execsz, NULL);
 798                 if (error || junk != NULL) {
 799                         VN_RELE(nvp);
 800                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
 801                         goto bad;
 802                 }
 803 
 804                 /*
 805                  * We use the DTrace program header to initialize the
 806                  * architecture-specific user per-LWP location. The dtrace
 807                  * fasttrap provider requires ready access to per-LWP scratch
 808                  * space. We assume that there is only one such program header
 809                  * in the interpreter.
 810                  */
 811                 if (dtrphdr != NULL &&
 812                     dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
 813                         VN_RELE(nvp);
 814                         uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
 815                         goto bad;
 816                 }
 817 
 818                 VN_RELE(nvp);
 819                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
 820         }
 821 
 822         if (hasauxv) {
 823                 int auxf = AF_SUN_HWCAPVERIFY;
 824                 /*
 825                  * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
 826                  * exec_args()
 827                  */
 828                 ADDAUX(aux, AT_BASE, voffset)
 829                 ADDAUX(aux, AT_FLAGS, at_flags)
 830                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
 831                 /*
 832                  * Linker flags. (security)
 833                  * p_flag not yet set at this time.
 834                  * We rely on gexec() to provide us with the information.
 835                  * If the application is set-uid but this is not reflected
 836                  * in a mismatch between real/effective uids/gids, then
 837                  * don't treat this as a set-uid exec.  So we care about
 838                  * the EXECSETID_UGIDS flag but not the ...SETID flag.
 839                  */
 840                 if ((setid &= ~EXECSETID_SETID) != 0)
 841                         auxf |= AF_SUN_SETUGID;
 842 
 843                 /*
 844                  * If we're running a native process from within a branded
 845                  * zone under pfexec then we clear the AF_SUN_SETUGID flag so
 846                  * that the native ld.so.1 is able to link with the native
 847                  * libraries instead of using the brand libraries that are
 848                  * installed in the zone.  We only do this for processes
 849                  * which we trust because we see they are already running
 850                  * under pfexec (where uid != euid).  This prevents a
 851                  * malicious user within the zone from crafting a wrapper to
 852                  * run native suid commands with unsecure libraries interposed.
 853                  */
 854                 if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 855                     (setid &= ~EXECSETID_SETID) != 0))
 856                         auxf &= ~AF_SUN_SETUGID;
 857 
 858                 /*
 859                  * Record the user addr of the auxflags aux vector entry
 860                  * since brands may optionally want to manipulate this field.
 861                  */
 862                 args->auxp_auxflags =
 863                     (char *)((char *)args->stackend +
 864                     ((char *)&aux->a_type -
 865                     (char *)bigwad->elfargs));
 866                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
 867 
 868                 /*
 869                  * Hardware capability flag word (performance hints)
 870                  * Used for choosing faster library routines.
 871                  * (Potentially different between 32-bit and 64-bit ABIs)
 872                  */
 873 #if defined(_LP64)
 874                 if (args->to_model == DATAMODEL_NATIVE) {
 875                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 876                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 877                 } else {
 878                         ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
 879                         ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
 880                 }
 881 #else
 882                 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
 883                 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
 884 #endif
 885                 if (branded) {
 886                         /*
 887                          * Reserve space for the brand-private aux vectors,
 888                          * and record the user addr of that space.
 889                          */
 890                         args->auxp_brand =
 891                             (char *)((char *)args->stackend +
 892                             ((char *)&aux->a_type -
 893                             (char *)bigwad->elfargs));
 894                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
 895                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
 896                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
 897                 }
 898 
 899                 ADDAUX(aux, AT_NULL, 0)
 900                 postfixsize = (char *)aux - (char *)bigwad->elfargs;
 901 
 902                 /*
 903                  * We make assumptions above when we determine how many aux
 904                  * vector entries we will be adding. However, if we have an
 905                  * invalid elf file, it is possible that mapelfexec might
 906                  * behave differently (but not return an error), in which case
 907                  * the number of aux entries we actually add will be different.
 908                  * We detect that now and error out.
 909                  */
 910                 if (postfixsize != args->auxsize) {
 911                         DTRACE_PROBE2(elfexec_badaux, int, postfixsize,
 912                             int, args->auxsize);
 913                         goto bad;
 914                 }
 915                 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
 916         }
 917 
 918         /*
 919          * For the 64-bit kernel, the limit is big enough that rounding it up
 920          * to a page can overflow the 64-bit limit, so we check for btopr()
 921          * overflowing here by comparing it with the unrounded limit in pages.
 922          * If it hasn't overflowed, compare the exec size with the rounded up
 923          * limit in pages.  Otherwise, just compare with the unrounded limit.
 924          */
 925         limit = btop(p->p_vmem_ctl);
 926         roundlimit = btopr(p->p_vmem_ctl);
 927         if ((roundlimit > limit && *execsz > roundlimit) ||
 928             (roundlimit < limit && *execsz > limit)) {
 929                 mutex_enter(&p->p_lock);
 930                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
 931                     RCA_SAFE);
 932                 mutex_exit(&p->p_lock);
 933                 error = ENOMEM;
 934                 goto bad;
 935         }
 936 
 937         bzero(up->u_auxv, sizeof (up->u_auxv));
 938         if (postfixsize) {
 939                 int num_auxv;
 940 
 941                 /*
 942                  * Copy the aux vector to the user stack.
 943                  */
 944                 error = execpoststack(args, bigwad->elfargs, postfixsize);
 945                 if (error)
 946                         goto bad;
 947 
 948                 /*
                 * Copy auxv to the process's user structure for use by /proc.
                 * If this is a branded process, the brand's exec routine will
                 * copy its private entries to the user structure later. It
                 * relies on the fact that the blank entries are at the end.
 953                  */
 954                 num_auxv = postfixsize / sizeof (aux_entry_t);
 955                 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
 956                 aux = bigwad->elfargs;
 957                 for (i = 0; i < num_auxv; i++) {
 958                         up->u_auxv[i].a_type = aux[i].a_type;
 959                         up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
 960                 }
 961         }
 962 
 963         /*
 964          * Pass back the starting address so we can set the program counter.
 965          */
 966         args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
 967 
 968         if (!uphdr) {
 969                 if (ehdrp->e_type == ET_DYN) {
 970                         /*
                         * If we are executing a shared library which doesn't
                         * have an interpreter (probably ld.so.1) then
                         * we don't set the brkbase now.  Instead we
                         * delay its setting until the first call
                         * via grow.c::brk().  This permits ld.so.1 to
                         * initialize brkbase to the tail of the executable it
                         * loads (which is where it needs to be).
 978                          */
 979                         bigwad->exenv.ex_brkbase = (caddr_t)0;
 980                         bigwad->exenv.ex_bssbase = (caddr_t)0;
 981                         bigwad->exenv.ex_brksize = 0;
 982                 } else {
 983                         bigwad->exenv.ex_brkbase = brkbase;
 984                         bigwad->exenv.ex_bssbase = bssbase;
 985                         bigwad->exenv.ex_brksize = brksize;
 986                 }
 987                 bigwad->exenv.ex_magic = elfmagic;
 988                 bigwad->exenv.ex_vp = vp;
 989                 setexecenv(&bigwad->exenv);
 990         }
 991 
 992         ASSERT(error == 0);
 993         goto out;
 994 
 995 bad:
 996         if (fd != -1)           /* did we open the a.out yet */
 997                 (void) execclose(fd);
 998 
 999         psignal(p, SIGKILL);
1000 
1001         if (error == 0)
1002                 error = ENOEXEC;
1003 out:
1004         if (phdrbase != NULL)
1005                 kmem_free(phdrbase, phdrsize);
1006         if (cap != NULL)
1007                 kmem_free(cap, capsize);
1008         kmem_free(bigwad, sizeof (struct bigwad));
1009         return (error);
1010 }
1011 
1012 /*
1013  * Compute the memory size requirement for the ELF file.
1014  */
1015 static size_t
1016 elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata)
1017 {
1018         size_t  len;
1019         Phdr    *phdrp = (Phdr *)phdrbase;
1020         int     hsize = ehdrp->e_phentsize;
1021         int     first = 1;
1022         int     dfirst = 1;     /* first data segment */
1023         uintptr_t loaddr = 0;
1024         uintptr_t hiaddr = 0;
1025         uintptr_t lo, hi;
1026         int     i;
1027 
1028         for (i = nphdrs; i > 0; i--) {
1029                 if (phdrp->p_type == PT_LOAD) {
1030                         lo = phdrp->p_vaddr;
1031                         hi = lo + phdrp->p_memsz;
1032                         if (first) {
1033                                 loaddr = lo;
1034                                 hiaddr = hi;
1035                                 first = 0;
1036                         } else {
1037                                 if (loaddr > lo)
1038                                         loaddr = lo;
1039                                 if (hiaddr < hi)
1040                                         hiaddr = hi;
1041                         }
1042 
1043                         /*
1044                          * save the address of the first data segment
1045                          * of a object - used for the AT_SUNW_LDDATA
1046                          * aux entry.
1047                          */
1048                         if ((lddata != NULL) && dfirst &&
1049                             (phdrp->p_flags & PF_W)) {
1050                                 *lddata = lo;
1051                                 dfirst = 0;
1052                         }
1053                 }
1054                 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
1055         }
1056 
1057         len = hiaddr - (loaddr & PAGEMASK);
1058         len = roundup(len, PAGESIZE);
1059 
1060         return (len);
1061 }
1062 
1063 /*
1064  * Read in the ELF header and program header table.
1065  * SUSV3 requires:
1066  *      ENOEXEC File format is not recognized
1067  *      EINVAL  Format recognized but execution not supported
1068  */
1069 static int
1070 getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx,
1071     int *nphdrs)
1072 {
1073         int error;
1074         ssize_t resid;
1075 
1076         /*
1077          * We got here by the first two bytes in ident,
1078          * now read the entire ELF header.
1079          */
1080         if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
1081             sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
1082             (rlim64_t)0, credp, &resid)) != 0)
1083                 return (error);
1084 
1085         /*
1086          * Since a separate version is compiled for handling 32-bit and
1087          * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
1088          * doesn't need to be able to deal with 32-bit ELF files.
1089          */
1090         if (resid != 0 ||
1091             ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
1092             ehdr->e_ident[EI_MAG3] != ELFMAG3)
1093                 return (ENOEXEC);
1094 
1095         if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
1096 #if defined(_ILP32) || defined(_ELF32_COMPAT)
1097             ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
1098 #else
1099             ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
1100 #endif
1101             !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
1102             ehdr->e_flags))
1103                 return (EINVAL);
1104 
1105         *nshdrs = ehdr->e_shnum;
1106         *shstrndx = ehdr->e_shstrndx;
1107         *nphdrs = ehdr->e_phnum;
1108 
1109         /*
1110          * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
1111          * to read in the section header at index zero to acces the true
1112          * values for those fields.
1113          */
1114         if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
1115             *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
1116                 Shdr shdr;
1117 
1118                 if (ehdr->e_shoff == 0)
1119                         return (EINVAL);
1120 
1121                 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
1122                     sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
1123                     (rlim64_t)0, credp, &resid)) != 0)
1124                         return (error);
1125 
1126                 if (*nshdrs == 0)
1127                         *nshdrs = shdr.sh_size;
1128                 if (*shstrndx == SHN_XINDEX)
1129                         *shstrndx = shdr.sh_link;
1130                 if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
1131                         *nphdrs = shdr.sh_info;
1132         }
1133 
1134         return (0);
1135 }
1136 
1137 #ifdef _ELF32_COMPAT
1138 extern size_t elf_nphdr_max;
1139 #else
1140 size_t elf_nphdr_max = 1000;
1141 #endif
1142 
1143 static int
1144 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs,
1145     caddr_t *phbasep, ssize_t *phsizep)
1146 {
1147         ssize_t resid, minsize;
1148         int err;
1149 
1150         /*
1151          * Since we're going to be using e_phentsize to iterate down the
1152          * array of program headers, it must be 8-byte aligned or else
1153          * a we might cause a misaligned access. We use all members through
1154          * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so
1155          * e_phentsize must be at least large enough to include those
1156          * members.
1157          */
1158 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1159         minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags);
1160 #else
1161         minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz);
1162 #endif
1163         if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3))
1164                 return (EINVAL);
1165 
1166         *phsizep = nphdrs * ehdr->e_phentsize;
1167 
1168         if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1169                 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1170                         return (ENOMEM);
1171         } else {
1172                 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1173         }
1174 
1175         if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep,
1176             (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1177             credp, &resid)) != 0) {
1178                 kmem_free(*phbasep, *phsizep);
1179                 *phbasep = NULL;
1180                 return (err);
1181         }
1182 
1183         return (0);
1184 }
1185 
1186 #ifdef _ELF32_COMPAT
1187 extern size_t elf_nshdr_max;
1188 extern size_t elf_shstrtab_max;
1189 #else
1190 size_t elf_nshdr_max = 10000;
1191 size_t elf_shstrtab_max = 100 * 1024;
1192 #endif
1193 
1194 
1195 static int
1196 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
1197     int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep,
1198     char **shstrbasep, ssize_t *shstrsizep)
1199 {
1200         ssize_t resid, minsize;
1201         int err;
1202         Shdr *shdr;
1203 
1204         /*
1205          * Since we're going to be using e_shentsize to iterate down the
1206          * array of section headers, it must be 8-byte aligned or else
1207          * a we might cause a misaligned access. We use all members through
1208          * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1209          * must be at least large enough to include that member. The index
1210          * of the string table section must also be valid.
1211          */
1212         minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize);
1213         if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) ||
1214             shstrndx >= nshdrs)
1215                 return (EINVAL);
1216 
1217         *shsizep = nshdrs * ehdr->e_shentsize;
1218 
1219         if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1220                 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1221                         return (ENOMEM);
1222         } else {
1223                 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1224         }
1225 
1226         if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep,
1227             (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1228             credp, &resid)) != 0) {
1229                 kmem_free(*shbasep, *shsizep);
1230                 return (err);
1231         }
1232 
1233         /*
1234          * Pull the section string table out of the vnode; fail if the size
1235          * is zero.
1236          */
1237         shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1238         if ((*shstrsizep = shdr->sh_size) == 0) {
1239                 kmem_free(*shbasep, *shsizep);
1240                 return (EINVAL);
1241         }
1242 
1243         if (*shstrsizep > elf_shstrtab_max) {
1244                 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1245                     KM_NOSLEEP)) == NULL) {
1246                         kmem_free(*shbasep, *shsizep);
1247                         return (ENOMEM);
1248                 }
1249         } else {
1250                 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1251         }
1252 
1253         if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep,
1254             (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1255             credp, &resid)) != 0) {
1256                 kmem_free(*shbasep, *shsizep);
1257                 kmem_free(*shstrbasep, *shstrsizep);
1258                 return (err);
1259         }
1260 
1261         /*
1262          * Make sure the strtab is null-terminated to make sure we
1263          * don't run off the end of the table.
1264          */
1265         (*shstrbasep)[*shstrsizep - 1] = '\0';
1266 
1267         return (0);
1268 }
1269 
1270 static int
1271 mapelfexec(
1272         vnode_t *vp,
1273         Ehdr *ehdr,
1274         int nphdrs,
1275         caddr_t phdrbase,
1276         Phdr **uphdr,
1277         Phdr **intphdr,
1278         Phdr **stphdr,
1279         Phdr **dtphdr,
1280         Phdr *dataphdrp,
1281         caddr_t *bssbase,
1282         caddr_t *brkbase,
1283         intptr_t *voffset,
1284         intptr_t *minaddr,
1285         size_t len,
1286         long *execsz,
1287         size_t *brksize)
1288 {
1289         Phdr *phdr;
1290         int i, prot, error;
1291         caddr_t addr = NULL;
1292         size_t zfodsz;
1293         int ptload = 0;
1294         int page;
1295         off_t offset;
1296         int hsize = ehdr->e_phentsize;
1297         caddr_t mintmp = (caddr_t)-1;
1298         extern int use_brk_lpg;
1299 
1300         if (ehdr->e_type == ET_DYN) {
1301                 secflagset_t flags = 0;
1302                 /*
1303                  * Obtain the virtual address of a hole in the
1304                  * address space to map the "interpreter".
1305                  */
1306                 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1307                         flags |= _MAP_RANDOMIZE;
1308 
1309                 map_addr(&addr, len, (offset_t)0, 1, flags);
1310                 if (addr == NULL)
1311                         return (ENOMEM);
1312                 *voffset = (intptr_t)addr;
1313 
1314                 /*
1315                  * Calculate the minimum vaddr so it can be subtracted out.
1316                  * According to the ELF specification, since PT_LOAD sections
1317                  * must be sorted by increasing p_vaddr values, this is
1318                  * guaranteed to be the first PT_LOAD section.
1319                  */
1320                 phdr = (Phdr *)phdrbase;
1321                 for (i = nphdrs; i > 0; i--) {
1322                         if (phdr->p_type == PT_LOAD) {
1323                                 *voffset -= (uintptr_t)phdr->p_vaddr;
1324                                 break;
1325                         }
1326                         phdr = (Phdr *)((caddr_t)phdr + hsize);
1327                 }
1328 
1329         } else {
1330                 *voffset = 0;
1331         }
1332         phdr = (Phdr *)phdrbase;
1333         for (i = nphdrs; i > 0; i--) {
1334                 switch (phdr->p_type) {
1335                 case PT_LOAD:
1336                         if ((*intphdr != NULL) && (*uphdr == NULL))
1337                                 return (0);
1338 
1339                         ptload = 1;
1340                         prot = PROT_USER;
1341                         if (phdr->p_flags & PF_R)
1342                                 prot |= PROT_READ;
1343                         if (phdr->p_flags & PF_W)
1344                                 prot |= PROT_WRITE;
1345                         if (phdr->p_flags & PF_X)
1346                                 prot |= PROT_EXEC;
1347 
1348                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1349 
1350                         /*
1351                          * Keep track of the segment with the lowest starting
1352                          * address.
1353                          */
1354                         if (addr < mintmp)
1355                                 mintmp = addr;
1356 
1357                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1358 
1359                         offset = phdr->p_offset;
1360                         if (((uintptr_t)offset & PAGEOFFSET) ==
1361                             ((uintptr_t)addr & PAGEOFFSET) &&
1362                             (!(vp->v_flag & VNOMAP))) {
1363                                 page = 1;
1364                         } else {
1365                                 page = 0;
1366                         }
1367 
1368                         /*
1369                          * Set the heap pagesize for OOB when the bss size
1370                          * is known and use_brk_lpg is not 0.
1371                          */
1372                         if (brksize != NULL && use_brk_lpg &&
1373                             zfodsz != 0 && phdr == dataphdrp &&
1374                             (prot & PROT_WRITE)) {
1375                                 size_t tlen = P2NPHASE((uintptr_t)addr +
1376                                     phdr->p_filesz, PAGESIZE);
1377 
1378                                 if (zfodsz > tlen) {
1379                                         curproc->p_brkpageszc =
1380                                             page_szc(map_pgsz(MAPPGSZ_HEAP,
1381                                             curproc, addr + phdr->p_filesz +
1382                                             tlen, zfodsz - tlen, 0));
1383                                 }
1384                         }
1385 
1386                         if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1387                             (prot & PROT_WRITE)) {
1388                                 uint_t  szc = curproc->p_brkpageszc;
1389                                 size_t pgsz = page_get_pagesize(szc);
1390                                 caddr_t ebss = addr + phdr->p_memsz;
1391                                 /*
1392                                  * If we need extra space to keep the BSS an
1393                                  * integral number of pages in size, some of
1394                                  * that space may fall beyond p_brkbase, so we
1395                                  * need to set p_brksize to account for it
1396                                  * being (logically) part of the brk.
1397                                  */
1398                                 size_t extra_zfodsz;
1399 
1400                                 ASSERT(pgsz > PAGESIZE);
1401 
1402                                 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1403 
1404                                 if (error = execmap(vp, addr, phdr->p_filesz,
1405                                     zfodsz + extra_zfodsz, phdr->p_offset,
1406                                     prot, page, szc))
1407                                         goto bad;
1408                                 if (brksize != NULL)
1409                                         *brksize = extra_zfodsz;
1410                         } else {
1411                                 if (error = execmap(vp, addr, phdr->p_filesz,
1412                                     zfodsz, phdr->p_offset, prot, page, 0))
1413                                         goto bad;
1414                         }
1415 
1416                         if (bssbase != NULL && addr >= *bssbase &&
1417                             phdr == dataphdrp) {
1418                                 *bssbase = addr + phdr->p_filesz;
1419                         }
1420                         if (brkbase != NULL && addr >= *brkbase) {
1421                                 *brkbase = addr + phdr->p_memsz;
1422                         }
1423 
1424                         *execsz += btopr(phdr->p_memsz);
1425                         break;
1426 
1427                 case PT_INTERP:
1428                         if (ptload)
1429                                 goto bad;
1430                         *intphdr = phdr;
1431                         break;
1432 
1433                 case PT_SHLIB:
1434                         *stphdr = phdr;
1435                         break;
1436 
1437                 case PT_PHDR:
1438                         if (ptload)
1439                                 goto bad;
1440                         *uphdr = phdr;
1441                         break;
1442 
1443                 case PT_NULL:
1444                 case PT_DYNAMIC:
1445                 case PT_NOTE:
1446                         break;
1447 
1448                 case PT_SUNWDTRACE:
1449                         if (dtphdr != NULL)
1450                                 *dtphdr = phdr;
1451                         break;
1452 
1453                 default:
1454                         break;
1455                 }
1456                 phdr = (Phdr *)((caddr_t)phdr + hsize);
1457         }
1458 
1459         if (minaddr != NULL) {
1460                 ASSERT(mintmp != (caddr_t)-1);
1461                 *minaddr = (intptr_t)mintmp;
1462         }
1463 
1464         if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1465                 size_t off;
1466                 uintptr_t base = (uintptr_t)*brkbase;
1467                 uintptr_t oend = base + *brksize;
1468 
1469                 ASSERT(ISP2(aslr_max_brk_skew));
1470 
1471                 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1472                 base += P2PHASE(off, aslr_max_brk_skew);
1473                 base = P2ROUNDUP(base, PAGESIZE);
1474                 *brkbase = (caddr_t)base;
1475                 /*
1476                  * Above, we set *brksize to account for the possibility we
1477                  * had to grow the 'brk' in padding out the BSS to a page
1478                  * boundary.
1479                  *
1480                  * We now need to adjust that based on where we now are
1481                  * actually putting the brk.
1482                  */
1483                 if (oend > base)
1484                         *brksize = oend - base;
1485                 else
1486                         *brksize = 0;
1487         }
1488 
1489         return (0);
1490 bad:
1491         if (error == 0)
1492                 error = EINVAL;
1493         return (error);
1494 }
1495 
1496 int
1497 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1498     rlim64_t rlimit, cred_t *credp)
1499 {
1500         Note note;
1501         int error;
1502 
1503         bzero(&note, sizeof (note));
1504         bcopy("CORE", note.name, 4);
1505         note.nhdr.n_type = type;
1506         /*
1507          * The System V ABI states that n_namesz must be the length of the
1508          * string that follows the Nhdr structure including the terminating
1509          * null. The ABI also specifies that sufficient padding should be
1510          * included so that the description that follows the name string
1511          * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1512          * respectively. However, since this change was not made correctly
1513          * at the time of the 64-bit port, both 32- and 64-bit binaries
1514          * descriptions are only guaranteed to begin on a 4-byte boundary.
1515          */
1516         note.nhdr.n_namesz = 5;
1517         note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1518 
1519         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1520             sizeof (note), rlimit, credp))
1521                 return (error);
1522 
1523         *offsetp += sizeof (note);
1524 
1525         if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1526             note.nhdr.n_descsz, rlimit, credp))
1527                 return (error);
1528 
1529         *offsetp += note.nhdr.n_descsz;
1530         return (0);
1531 }
1532 
1533 /*
1534  * Copy the section data from one vnode to the section of another vnode.
1535  */
1536 static void
1537 copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset,
1538     void *buf, size_t size, cred_t *credp, rlim64_t rlimit)
1539 {
1540         ssize_t resid;
1541         size_t len, n = src->sh_size;
1542         offset_t off = 0;
1543 
1544         while (n != 0) {
1545                 len = MIN(size, n);
1546                 if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off,
1547                     UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
1548                     resid >= len ||
1549                     core_write(dst_vp, UIO_SYSSPACE, *doffset + off,
1550                     buf, len - resid, rlimit, credp) != 0) {
1551                         dst->sh_size = 0;
1552                         dst->sh_offset = 0;
1553                         return;
1554                 }
1555 
1556                 ASSERT(n >= len - resid);
1557 
1558                 n -= len - resid;
1559                 off += len - resid;
1560         }
1561 
1562         *doffset += src->sh_size;
1563 }
1564 
/*
 * Largest scratch buffer, in bytes, that process_scns() will allocate for
 * copying a section into a core file.  The variable is defined once, in the
 * build without _ELF32_COMPAT; the _ELF32_COMPAT build only refers to it.
 */
#ifndef _ELF32_COMPAT
size_t elf_datasz_max = 1024 * 1024;
#else
extern size_t elf_datasz_max;
#endif
1570 
1571 /*
1572  * This function processes mappings that correspond to load objects to
1573  * examine their respective sections for elfcore(). It's called once with
1574  * v set to NULL to count the number of sections that we're going to need
1575  * and then again with v set to some allocated buffer that we fill in with
1576  * all the section data.
1577  */
1578 static int
1579 process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp,
1580     Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp)
1581 {
1582         vnode_t *lastvp = NULL;
1583         struct seg *seg;
1584         int i, j;
1585         void *data = NULL;
1586         size_t datasz = 0;
1587         shstrtab_t shstrtab;
1588         struct as *as = p->p_as;
1589         int error = 0;
1590 
1591         if (v != NULL)
1592                 shstrtab_init(&shstrtab);
1593 
1594         i = 1;
1595         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1596                 uint_t prot;
1597                 vnode_t *mvp;
1598                 void *tmp = NULL;
1599                 caddr_t saddr = seg->s_base;
1600                 caddr_t naddr;
1601                 caddr_t eaddr;
1602                 size_t segsize;
1603 
1604                 Ehdr ehdr;
1605                 int nshdrs, shstrndx, nphdrs;
1606                 caddr_t shbase;
1607                 ssize_t shsize;
1608                 char *shstrbase;
1609                 ssize_t shstrsize;
1610 
1611                 Shdr *shdr;
1612                 const char *name;
1613                 size_t sz;
1614                 uintptr_t off;
1615 
1616                 int ctf_ndx = 0;
1617                 int symtab_ndx = 0;
1618 
1619                 /*
1620                  * Since we're just looking for text segments of load
1621                  * objects, we only care about the protection bits; we don't
1622                  * care about the actual size of the segment so we use the
1623                  * reserved size. If the segment's size is zero, there's
1624                  * something fishy going on so we ignore this segment.
1625                  */
1626                 if (seg->s_ops != &segvn_ops ||
1627                     SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
1628                     mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
1629                     (segsize = pr_getsegsize(seg, 1)) == 0)
1630                         continue;
1631 
1632                 eaddr = saddr + segsize;
1633                 prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
1634                 pr_getprot_done(&tmp);
1635 
1636                 /*
1637                  * Skip this segment unless the protection bits look like
1638                  * what we'd expect for a text segment.
1639                  */
1640                 if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
1641                         continue;
1642 
1643                 if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx,
1644                     &nphdrs) != 0 ||
1645                     getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx,
1646                     &shbase, &shsize, &shstrbase, &shstrsize) != 0)
1647                         continue;
1648 
1649                 off = ehdr.e_shentsize;
1650                 for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) {
1651                         Shdr *symtab = NULL, *strtab;
1652 
1653                         shdr = (Shdr *)(shbase + off);
1654 
1655                         if (shdr->sh_name >= shstrsize)
1656                                 continue;
1657 
1658                         name = shstrbase + shdr->sh_name;
1659 
1660                         if (strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1661                                 if ((content & CC_CONTENT_CTF) == 0 ||
1662                                     ctf_ndx != 0)
1663                                         continue;
1664 
1665                                 if (shdr->sh_link > 0 &&
1666                                     shdr->sh_link < nshdrs) {
1667                                         symtab = (Shdr *)(shbase +
1668                                             shdr->sh_link * ehdr.e_shentsize);
1669                                 }
1670 
1671                                 if (v != NULL && i < nv - 1) {
1672                                         if (shdr->sh_size > datasz &&
1673                                             shdr->sh_size <= elf_datasz_max) {
1674                                                 if (data != NULL)
1675                                                         kmem_free(data, datasz);
1676 
1677                                                 datasz = shdr->sh_size;
1678                                                 data = kmem_alloc(datasz,
1679                                                     KM_SLEEP);
1680                                         }
1681 
1682                                         v[i].sh_name = shstrtab_ndx(&shstrtab,
1683                                             STR_CTF);
1684                                         v[i].sh_addr = (Addr)(uintptr_t)saddr;
1685                                         v[i].sh_type = SHT_PROGBITS;
1686                                         v[i].sh_addralign = 4;
1687                                         *doffsetp = roundup(*doffsetp,
1688                                             v[i].sh_addralign);
1689                                         v[i].sh_offset = *doffsetp;
1690                                         v[i].sh_size = shdr->sh_size;
1691                                         if (symtab == NULL)  {
1692                                                 v[i].sh_link = 0;
1693                                         } else if (symtab->sh_type ==
1694                                             SHT_SYMTAB &&
1695                                             symtab_ndx != 0) {
1696                                                 v[i].sh_link =
1697                                                     symtab_ndx;
1698                                         } else {
1699                                                 v[i].sh_link = i + 1;
1700                                         }
1701 
1702                                         copy_scn(shdr, mvp, &v[i], vp,
1703                                             doffsetp, data, datasz, credp,
1704                                             rlimit);
1705                                 }
1706 
1707                                 ctf_ndx = i++;
1708 
1709                                 /*
1710                                  * We've already dumped the symtab.
1711                                  */
1712                                 if (symtab != NULL &&
1713                                     symtab->sh_type == SHT_SYMTAB &&
1714                                     symtab_ndx != 0)
1715                                         continue;
1716 
1717                         } else if (strcmp(name,
1718                             shstrtab_data[STR_SYMTAB]) == 0) {
1719                                 if ((content & CC_CONTENT_SYMTAB) == 0 ||
1720                                     symtab != 0)
1721                                         continue;
1722 
1723                                 symtab = shdr;
1724                         }
1725 
1726                         if (symtab != NULL) {
1727                                 if ((symtab->sh_type != SHT_DYNSYM &&
1728                                     symtab->sh_type != SHT_SYMTAB) ||
1729                                     symtab->sh_link == 0 ||
1730                                     symtab->sh_link >= nshdrs)
1731                                         continue;
1732 
1733                                 strtab = (Shdr *)(shbase +
1734                                     symtab->sh_link * ehdr.e_shentsize);
1735 
1736                                 if (strtab->sh_type != SHT_STRTAB)
1737                                         continue;
1738 
1739                                 if (v != NULL && i < nv - 2) {
1740                                         sz = MAX(symtab->sh_size,
1741                                             strtab->sh_size);
1742                                         if (sz > datasz &&
1743                                             sz <= elf_datasz_max) {
1744                                                 if (data != NULL)
1745                                                         kmem_free(data, datasz);
1746 
1747                                                 datasz = sz;
1748                                                 data = kmem_alloc(datasz,
1749                                                     KM_SLEEP);
1750                                         }
1751 
1752                                         if (symtab->sh_type == SHT_DYNSYM) {
1753                                                 v[i].sh_name = shstrtab_ndx(
1754                                                     &shstrtab, STR_DYNSYM);
1755                                                 v[i + 1].sh_name = shstrtab_ndx(
1756                                                     &shstrtab, STR_DYNSTR);
1757                                         } else {
1758                                                 v[i].sh_name = shstrtab_ndx(
1759                                                     &shstrtab, STR_SYMTAB);
1760                                                 v[i + 1].sh_name = shstrtab_ndx(
1761                                                     &shstrtab, STR_STRTAB);
1762                                         }
1763 
1764                                         v[i].sh_type = symtab->sh_type;
1765                                         v[i].sh_addr = symtab->sh_addr;
1766                                         if (ehdr.e_type == ET_DYN ||
1767                                             v[i].sh_addr == 0)
1768                                                 v[i].sh_addr +=
1769                                                     (Addr)(uintptr_t)saddr;
1770                                         v[i].sh_addralign =
1771                                             symtab->sh_addralign;
1772                                         *doffsetp = roundup(*doffsetp,
1773                                             v[i].sh_addralign);
1774                                         v[i].sh_offset = *doffsetp;
1775                                         v[i].sh_size = symtab->sh_size;
1776                                         v[i].sh_link = i + 1;
1777                                         v[i].sh_entsize = symtab->sh_entsize;
1778                                         v[i].sh_info = symtab->sh_info;
1779 
1780                                         copy_scn(symtab, mvp, &v[i], vp,
1781                                             doffsetp, data, datasz, credp,
1782                                             rlimit);
1783 
1784                                         v[i + 1].sh_type = SHT_STRTAB;
1785                                         v[i + 1].sh_flags = SHF_STRINGS;
1786                                         v[i + 1].sh_addr = symtab->sh_addr;
1787                                         if (ehdr.e_type == ET_DYN ||
1788                                             v[i + 1].sh_addr == 0)
1789                                                 v[i + 1].sh_addr +=
1790                                                     (Addr)(uintptr_t)saddr;
1791                                         v[i + 1].sh_addralign =
1792                                             strtab->sh_addralign;
1793                                         *doffsetp = roundup(*doffsetp,
1794                                             v[i + 1].sh_addralign);
1795                                         v[i + 1].sh_offset = *doffsetp;
1796                                         v[i + 1].sh_size = strtab->sh_size;
1797 
1798                                         copy_scn(strtab, mvp, &v[i + 1], vp,
1799                                             doffsetp, data, datasz, credp,
1800                                             rlimit);
1801                                 }
1802 
1803                                 if (symtab->sh_type == SHT_SYMTAB)
1804                                         symtab_ndx = i;
1805                                 i += 2;
1806                         }
1807                 }
1808 
1809                 kmem_free(shstrbase, shstrsize);
1810                 kmem_free(shbase, shsize);
1811 
1812                 lastvp = mvp;
1813         }
1814 
1815         if (v == NULL) {
1816                 if (i == 1)
1817                         *nshdrsp = 0;
1818                 else
1819                         *nshdrsp = i + 1;
1820                 goto done;
1821         }
1822 
1823         if (i != nv - 1) {
1824                 cmn_err(CE_WARN, "elfcore: core dump failed for "
1825                     "process %d; address space is changing", p->p_pid);
1826                 error = EIO;
1827                 goto done;
1828         }
1829 
1830         v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB);
1831         v[i].sh_size = shstrtab_size(&shstrtab);
1832         v[i].sh_addralign = 1;
1833         *doffsetp = roundup(*doffsetp, v[i].sh_addralign);
1834         v[i].sh_offset = *doffsetp;
1835         v[i].sh_flags = SHF_STRINGS;
1836         v[i].sh_type = SHT_STRTAB;
1837 
1838         if (v[i].sh_size > datasz) {
1839                 if (data != NULL)
1840                         kmem_free(data, datasz);
1841 
1842                 datasz = v[i].sh_size;
1843                 data = kmem_alloc(datasz,
1844                     KM_SLEEP);
1845         }
1846 
1847         shstrtab_dump(&shstrtab, data);
1848 
1849         if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp,
1850             data, v[i].sh_size, rlimit, credp)) != 0)
1851                 goto done;
1852 
1853         *doffsetp += v[i].sh_size;
1854 
1855 done:
1856         if (data != NULL)
1857                 kmem_free(data, datasz);
1858 
1859         return (error);
1860 }
1861 
1862 int
1863 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
1864     core_content_t content)
1865 {
1866         offset_t poffset, soffset;
1867         Off doffset;
1868         int error, i, nphdrs, nshdrs;
1869         int overflow = 0;
1870         struct seg *seg;
1871         struct as *as = p->p_as;
1872         union {
1873                 Ehdr ehdr;
1874                 Phdr phdr[1];
1875                 Shdr shdr[1];
1876         } *bigwad;
1877         size_t bigsize;
1878         size_t phdrsz, shdrsz;
1879         Ehdr *ehdr;
1880         Phdr *v;
1881         caddr_t brkbase;
1882         size_t brksize;
1883         caddr_t stkbase;
1884         size_t stksize;
1885         int ntries = 0;
1886         klwp_t *lwp = ttolwp(curthread);
1887 
1888 top:
1889         /*
1890          * Make sure we have everything we need (registers, etc.).
1891          * All other lwps have already stopped and are in an orderly state.
1892          */
1893         ASSERT(p == ttoproc(curthread));
1894         prstop(0, 0);
1895 
1896         AS_LOCK_ENTER(as, RW_WRITER);
1897         nphdrs = prnsegs(as, 0) + 2;            /* two CORE note sections */
1898 
1899         /*
1900          * Count the number of section headers we're going to need.
1901          */
1902         nshdrs = 0;
1903         if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) {
1904                 (void) process_scns(content, p, credp, NULL, NULL, NULL, 0,
1905                     NULL, &nshdrs);
1906         }
1907         AS_LOCK_EXIT(as);
1908 
1909         ASSERT(nshdrs == 0 || nshdrs > 1);
1910 
1911         /*
1912          * The core file contents may required zero section headers, but if
1913          * we overflow the 16 bits allotted to the program header count in
1914          * the ELF header, we'll need that program header at index zero.
1915          */
1916         if (nshdrs == 0 && nphdrs >= PN_XNUM)
1917                 nshdrs = 1;
1918 
1919         phdrsz = nphdrs * sizeof (Phdr);
1920         shdrsz = nshdrs * sizeof (Shdr);
1921 
1922         bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz));
1923         bigwad = kmem_alloc(bigsize, KM_SLEEP);
1924 
1925         ehdr = &bigwad->ehdr;
1926         bzero(ehdr, sizeof (*ehdr));
1927 
1928         ehdr->e_ident[EI_MAG0] = ELFMAG0;
1929         ehdr->e_ident[EI_MAG1] = ELFMAG1;
1930         ehdr->e_ident[EI_MAG2] = ELFMAG2;
1931         ehdr->e_ident[EI_MAG3] = ELFMAG3;
1932         ehdr->e_ident[EI_CLASS] = ELFCLASS;
1933         ehdr->e_type = ET_CORE;
1934 
1935 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1936 
1937 #if defined(__sparc)
1938         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1939         ehdr->e_machine = EM_SPARC;
1940 #elif defined(__i386) || defined(__i386_COMPAT)
1941         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1942         ehdr->e_machine = EM_386;
1943 #else
1944 #error "no recognized machine type is defined"
1945 #endif
1946 
1947 #else   /* !defined(_LP64) || defined(_ELF32_COMPAT) */
1948 
1949 #if defined(__sparc)
1950         ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
1951         ehdr->e_machine = EM_SPARCV9;
1952 #elif defined(__amd64)
1953         ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
1954         ehdr->e_machine = EM_AMD64;
1955 #else
1956 #error "no recognized 64-bit machine type is defined"
1957 #endif
1958 
1959 #endif  /* !defined(_LP64) || defined(_ELF32_COMPAT) */
1960 
1961         /*
1962          * If the count of program headers or section headers or the index
1963          * of the section string table can't fit in the mere 16 bits
1964          * shortsightedly allotted to them in the ELF header, we use the
1965          * extended formats and put the real values in the section header
1966          * as index 0.
1967          */
1968         ehdr->e_version = EV_CURRENT;
1969         ehdr->e_ehsize = sizeof (Ehdr);
1970 
1971         if (nphdrs >= PN_XNUM)
1972                 ehdr->e_phnum = PN_XNUM;
1973         else
1974                 ehdr->e_phnum = (unsigned short)nphdrs;
1975 
1976         ehdr->e_phoff = sizeof (Ehdr);
1977         ehdr->e_phentsize = sizeof (Phdr);
1978 
1979         if (nshdrs > 0) {
1980                 if (nshdrs >= SHN_LORESERVE)
1981                         ehdr->e_shnum = 0;
1982                 else
1983                         ehdr->e_shnum = (unsigned short)nshdrs;
1984 
1985                 if (nshdrs - 1 >= SHN_LORESERVE)
1986                         ehdr->e_shstrndx = SHN_XINDEX;
1987                 else
1988                         ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
1989 
1990                 ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs;
1991                 ehdr->e_shentsize = sizeof (Shdr);
1992         }
1993 
1994         if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
1995             sizeof (Ehdr), rlimit, credp))
1996                 goto done;
1997 
1998         poffset = sizeof (Ehdr);
1999         soffset = sizeof (Ehdr) + phdrsz;
2000         doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2001 
2002         v = &bigwad->phdr[0];
2003         bzero(v, phdrsz);
2004 
2005         setup_old_note_header(&v[0], p);
2006         v[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2007         doffset += v[0].p_filesz;
2008 
2009         setup_note_header(&v[1], p);
2010         v[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2011         doffset += v[1].p_filesz;
2012 
2013         mutex_enter(&p->p_lock);
2014 
2015         brkbase = p->p_brkbase;
2016         brksize = p->p_brksize;
2017 
2018         stkbase = p->p_usrstack - p->p_stksize;
2019         stksize = p->p_stksize;
2020 
2021         mutex_exit(&p->p_lock);
2022 
2023         AS_LOCK_ENTER(as, RW_WRITER);
2024         i = 2;
2025         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2026                 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2027                 caddr_t saddr, naddr;
2028                 void *tmp = NULL;
2029                 extern struct seg_ops segspt_shmops;
2030 
2031                 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2032                         uint_t prot;
2033                         size_t size;
2034                         int type;
2035                         vnode_t *mvp;
2036 
2037                         prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2038                         prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2039                         if ((size = (size_t)(naddr - saddr)) == 0)
2040                                 continue;
2041                         if (i == nphdrs) {
2042                                 overflow++;
2043                                 continue;
2044                         }
2045                         v[i].p_type = PT_LOAD;
2046                         v[i].p_vaddr = (Addr)(uintptr_t)saddr;
2047                         v[i].p_memsz = size;
2048                         if (prot & PROT_READ)
2049                                 v[i].p_flags |= PF_R;
2050                         if (prot & PROT_WRITE)
2051                                 v[i].p_flags |= PF_W;
2052                         if (prot & PROT_EXEC)
2053                                 v[i].p_flags |= PF_X;
2054 
2055                         /*
2056                          * Figure out which mappings to include in the core.
2057                          */
2058                         type = SEGOP_GETTYPE(seg, saddr);
2059 
2060                         if (saddr == stkbase && size == stksize) {
2061                                 if (!(content & CC_CONTENT_STACK))
2062                                         goto exclude;
2063 
2064                         } else if (saddr == brkbase && size == brksize) {
2065                                 if (!(content & CC_CONTENT_HEAP))
2066                                         goto exclude;
2067 
2068                         } else if (seg->s_ops == &segspt_shmops) {
2069                                 if (type & MAP_NORESERVE) {
2070                                         if (!(content & CC_CONTENT_DISM))
2071                                                 goto exclude;
2072                                 } else {
2073                                         if (!(content & CC_CONTENT_ISM))
2074                                                 goto exclude;
2075                                 }
2076 
2077                         } else if (seg->s_ops != &segvn_ops) {
2078                                 goto exclude;
2079 
2080                         } else if (type & MAP_SHARED) {
2081                                 if (shmgetid(p, saddr) != SHMID_NONE) {
2082                                         if (!(content & CC_CONTENT_SHM))
2083                                                 goto exclude;
2084 
2085                                 } else if (SEGOP_GETVP(seg, seg->s_base,
2086                                     &mvp) != 0 || mvp == NULL ||
2087                                     mvp->v_type != VREG) {
2088                                         if (!(content & CC_CONTENT_SHANON))
2089                                                 goto exclude;
2090 
2091                                 } else {
2092                                         if (!(content & CC_CONTENT_SHFILE))
2093                                                 goto exclude;
2094                                 }
2095 
2096                         } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2097                             mvp == NULL || mvp->v_type != VREG) {
2098                                 if (!(content & CC_CONTENT_ANON))
2099                                         goto exclude;
2100 
2101                         } else if (prot == (PROT_READ | PROT_EXEC)) {
2102                                 if (!(content & CC_CONTENT_TEXT))
2103                                         goto exclude;
2104 
2105                         } else if (prot == PROT_READ) {
2106                                 if (!(content & CC_CONTENT_RODATA))
2107                                         goto exclude;
2108 
2109                         } else {
2110                                 if (!(content & CC_CONTENT_DATA))
2111                                         goto exclude;
2112                         }
2113 
2114                         doffset = roundup(doffset, sizeof (Word));
2115                         v[i].p_offset = doffset;
2116                         v[i].p_filesz = size;
2117                         doffset += size;
2118 exclude:
2119                         i++;
2120                 }
2121                 ASSERT(tmp == NULL);
2122         }
2123         AS_LOCK_EXIT(as);
2124 
2125         if (overflow || i != nphdrs) {
2126                 if (ntries++ == 0) {
2127                         kmem_free(bigwad, bigsize);
2128                         overflow = 0;
2129                         goto top;
2130                 }
2131                 cmn_err(CE_WARN, "elfcore: core dump failed for "
2132                     "process %d; address space is changing", p->p_pid);
2133                 error = EIO;
2134                 goto done;
2135         }
2136 
2137         if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2138             v, phdrsz, rlimit, credp)) != 0)
2139                 goto done;
2140 
2141         if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit,
2142             credp)) != 0)
2143                 goto done;
2144 
2145         if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit,
2146             credp, content)) != 0)
2147                 goto done;
2148 
2149         for (i = 2; i < nphdrs; i++) {
2150                 prkillinfo_t killinfo;
2151                 sigqueue_t *sq;
2152                 int sig, j;
2153 
2154                 if (v[i].p_filesz == 0)
2155                         continue;
2156 
2157                 /*
2158                  * If dumping out this segment fails, rather than failing
2159                  * the core dump entirely, we reset the size of the mapping
2160                  * to zero to indicate that the data is absent from the core
2161                  * file and or in the PF_SUNW_FAILURE flag to differentiate
2162                  * this from mappings that were excluded due to the core file
2163                  * content settings.
2164                  */
2165                 if ((error = core_seg(p, vp, v[i].p_offset,
2166                     (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz,
2167                     rlimit, credp)) == 0) {
2168                         continue;
2169                 }
2170 
2171                 if ((sig = lwp->lwp_cursig) == 0) {
2172                         /*
2173                          * We failed due to something other than a signal.
2174                          * Since the space reserved for the segment is now
2175                          * unused, we stash the errno in the first four
2176                          * bytes. This undocumented interface will let us
2177                          * understand the nature of the failure.
2178                          */
2179                         (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2180                             &error, sizeof (error), rlimit, credp);
2181 
2182                         v[i].p_filesz = 0;
2183                         v[i].p_flags |= PF_SUNW_FAILURE;
2184                         if ((error = core_write(vp, UIO_SYSSPACE,
2185                             poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]),
2186                             rlimit, credp)) != 0)
2187                                 goto done;
2188 
2189                         continue;
2190                 }
2191 
2192                 /*
2193                  * We took a signal.  We want to abort the dump entirely, but
2194                  * we also want to indicate what failed and why.  We therefore
2195                  * use the space reserved for the first failing segment to
2196                  * write our error (which, for purposes of compatability with
2197                  * older core dump readers, we set to EINTR) followed by any
2198                  * siginfo associated with the signal.
2199                  */
2200                 bzero(&killinfo, sizeof (killinfo));
2201                 killinfo.prk_error = EINTR;
2202 
2203                 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2204 
2205                 if (sq != NULL) {
2206                         bcopy(&sq->sq_info, &killinfo.prk_info,
2207                             sizeof (sq->sq_info));
2208                 } else {
2209                         killinfo.prk_info.si_signo = lwp->lwp_cursig;
2210                         killinfo.prk_info.si_code = SI_NOINFO;
2211                 }
2212 
2213 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2214                 /*
2215                  * If this is a 32-bit process, we need to translate from the
2216                  * native siginfo to the 32-bit variant.  (Core readers must
2217                  * always have the same data model as their target or must
2218                  * be aware of -- and compensate for -- data model differences.)
2219                  */
2220                 if (curproc->p_model == DATAMODEL_ILP32) {
2221                         siginfo32_t si32;
2222 
2223                         siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2224                         bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2225                 }
2226 #endif
2227 
2228                 (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset,
2229                     &killinfo, sizeof (killinfo), rlimit, credp);
2230 
2231                 /*
2232                  * For the segment on which we took the signal, indicate that
2233                  * its data now refers to a siginfo.
2234                  */
2235                 v[i].p_filesz = 0;
2236                 v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2237                     PF_SUNW_SIGINFO;
2238 
2239                 /*
2240                  * And for every other segment, indicate that its absence
2241                  * is due to a signal.
2242                  */
2243                 for (j = i + 1; j < nphdrs; j++) {
2244                         v[j].p_filesz = 0;
2245                         v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2246                 }
2247 
2248                 /*
2249                  * Finally, write out our modified program headers.
2250                  */
2251                 if ((error = core_write(vp, UIO_SYSSPACE,
2252                     poffset + sizeof (v[i]) * i, &v[i],
2253                     sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0)
2254                         goto done;
2255 
2256                 break;
2257         }
2258 
2259         if (nshdrs > 0) {
2260                 bzero(&bigwad->shdr[0], shdrsz);
2261 
2262                 if (nshdrs >= SHN_LORESERVE)
2263                         bigwad->shdr[0].sh_size = nshdrs;
2264 
2265                 if (nshdrs - 1 >= SHN_LORESERVE)
2266                         bigwad->shdr[0].sh_link = nshdrs - 1;
2267 
2268                 if (nphdrs >= PN_XNUM)
2269                         bigwad->shdr[0].sh_info = nphdrs;
2270 
2271                 if (nshdrs > 1) {
2272                         AS_LOCK_ENTER(as, RW_WRITER);
2273                         if ((error = process_scns(content, p, credp, vp,
2274                             &bigwad->shdr[0], nshdrs, rlimit, &doffset,
2275                             NULL)) != 0) {
2276                                 AS_LOCK_EXIT(as);
2277                                 goto done;
2278                         }
2279                         AS_LOCK_EXIT(as);
2280                 }
2281 
2282                 if ((error = core_write(vp, UIO_SYSSPACE, soffset,
2283                     &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0)
2284                         goto done;
2285         }
2286 
2287 done:
2288         kmem_free(bigwad, bigsize);
2289         return (error);
2290 }
2291 
2292 #ifndef _ELF32_COMPAT
2293 
2294 static struct execsw esw = {
2295 #ifdef  _LP64
2296         elf64magicstr,
2297 #else   /* _LP64 */
2298         elf32magicstr,
2299 #endif  /* _LP64 */
2300         0,
2301         5,
2302         elfexec,
2303         elfcore
2304 };
2305 
2306 static struct modlexec modlexec = {
2307         &mod_execops, "exec module for elf", &esw
2308 };
2309 
2310 #ifdef  _LP64
2311 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2312                         intpdata_t *idatap, int level, long *execsz,
2313                         int setid, caddr_t exec_file, cred_t *cred,
2314                         int brand_action);
2315 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2316                         rlim64_t rlimit, int sig, core_content_t content);
2317 
2318 static struct execsw esw32 = {
2319         elf32magicstr,
2320         0,
2321         5,
2322         elf32exec,
2323         elf32core
2324 };
2325 
2326 static struct modlexec modlexec32 = {
2327         &mod_execops, "32-bit exec module for elf", &esw32
2328 };
2329 #endif  /* _LP64 */
2330 
2331 static struct modlinkage modlinkage = {
2332         MODREV_1,
2333         (void *)&modlexec,
2334 #ifdef  _LP64
2335         (void *)&modlexec32,
2336 #endif  /* _LP64 */
2337         NULL
2338 };
2339 
2340 int
2341 _init(void)
2342 {
2343         return (mod_install(&modlinkage));
2344 }
2345 
2346 int
2347 _fini(void)
2348 {
2349         return (mod_remove(&modlinkage));
2350 }
2351 
2352 int
2353 _info(struct modinfo *modinfop)
2354 {
2355         return (mod_info(&modlinkage, modinfop));
2356 }
2357 
2358 #endif  /* !_ELF32_COMPAT */