1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2014 Joyent, Inc. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/sysmacros.h> 29 #include <sys/kmem.h> 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/errno.h> 33 #include <sys/mman.h> 34 #include <sys/cmn_err.h> 35 #include <sys/cred.h> 36 #include <sys/vmsystm.h> 37 #include <sys/machsystm.h> 38 #include <sys/debug.h> 39 #include <vm/as.h> 40 #include <vm/seg.h> 41 #include <sys/vmparam.h> 42 #include <sys/vfs.h> 43 #include <sys/elf.h> 44 #include <sys/machelf.h> 45 #include <sys/corectl.h> 46 #include <sys/exec.h> 47 #include <sys/exechdr.h> 48 #include <sys/autoconf.h> 49 #include <sys/mem.h> 50 #include <vm/seg_dev.h> 51 #include <sys/vmparam.h> 52 #include <sys/mmapobj.h> 53 #include <sys/atomic.h> 54 55 /* 56 * Theory statement: 57 * 58 * The main driving force behind mmapobj is to interpret and map ELF files 59 * inside of the kernel instead of having the linker be responsible for this. 60 * 61 * mmapobj also supports the AOUT 4.x binary format as well as flat files in 62 * a read only manner. 63 * 64 * When interpreting and mapping an ELF file, mmapobj will map each PT_LOAD 65 * or PT_SUNWBSS segment according to the ELF standard. Refer to the "Linker 66 * and Libraries Guide" for more information about the standard and mapping 67 * rules. 68 * 69 * Having mmapobj interpret and map objects will allow the kernel to make the 70 * best decision for where to place the mappings for said objects. Thus, we 71 * can make optimizations inside of the kernel for specific platforms or 72 * cache mapping information to make mapping objects faster. 73 * 74 * The lib_va_hash will be one such optimization. For each ELF object that 75 * mmapobj is asked to interpret, we will attempt to cache the information 76 * about the PT_LOAD and PT_SUNWBSS sections to speed up future mappings of 77 * the same objects. We will cache up to LIBVA_CACHED_SEGS (see below) program 78 * headers which should cover a majority of the libraries out there without 79 * wasting space. In order to make sure that the cached information is valid, 80 * we check the passed in vnode's mtime and ctime to make sure the vnode 81 * has not been modified since the last time we used it. 82 * 83 * In addition, the lib_va_hash may contain a preferred starting VA for the 84 * object which can be useful for platforms which support a shared context. 85 * This will increase the likelihood that library text can be shared among 86 * many different processes.
We limit the reserved VA space for 32 bit objects 87 * in order to minimize fragmenting the processes address space. 88 * 89 * In addition to the above, the mmapobj interface allows for padding to be 90 * requested before the first mapping and after the last mapping created. 91 * When padding is requested, no additional optimizations will be made for 92 * that request. 93 */ 94 95 /* 96 * Threshold to prevent allocating too much kernel memory to read in the 97 * program headers for an object. If it requires more than below, 98 * we will use a KM_NOSLEEP allocation to allocate memory to hold all of the 99 * program headers which could possibly fail. If less memory than below is 100 * needed, then we use a KM_SLEEP allocation and are willing to wait for the 101 * memory if we need to. 102 */ 103 size_t mmapobj_alloc_threshold = 65536; 104 105 /* Debug stats for test coverage */ 106 #ifdef DEBUG 107 struct mobj_stats { 108 uint_t mobjs_unmap_called; 109 uint_t mobjs_remap_devnull; 110 uint_t mobjs_lookup_start; 111 uint_t mobjs_alloc_start; 112 uint_t mobjs_alloc_vmem; 113 uint_t mobjs_add_collision; 114 uint_t mobjs_get_addr; 115 uint_t mobjs_map_flat_no_padding; 116 uint_t mobjs_map_flat_padding; 117 uint_t mobjs_map_ptload_text; 118 uint_t mobjs_map_ptload_initdata; 119 uint_t mobjs_map_ptload_preread; 120 uint_t mobjs_map_ptload_unaligned_text; 121 uint_t mobjs_map_ptload_unaligned_map_fail; 122 uint_t mobjs_map_ptload_unaligned_read_fail; 123 uint_t mobjs_zfoddiff; 124 uint_t mobjs_zfoddiff_nowrite; 125 uint_t mobjs_zfodextra; 126 uint_t mobjs_ptload_failed; 127 uint_t mobjs_map_elf_no_holes; 128 uint_t mobjs_unmap_hole; 129 uint_t mobjs_nomem_header; 130 uint_t mobjs_inval_header; 131 uint_t mobjs_overlap_header; 132 uint_t mobjs_np2_align; 133 uint_t mobjs_np2_align_overflow; 134 uint_t mobjs_exec_padding; 135 uint_t mobjs_exec_addr_mapped; 136 uint_t mobjs_exec_addr_devnull; 137 uint_t mobjs_exec_addr_in_use; 138 uint_t mobjs_lvp_found; 139 uint_t mobjs_no_loadable_yet; 140 uint_t mobjs_nothing_to_map; 141 uint_t mobjs_e2big; 142 uint_t mobjs_dyn_pad_align; 143 uint_t mobjs_dyn_pad_noalign; 144 uint_t mobjs_alloc_start_fail; 145 uint_t mobjs_lvp_nocache; 146 uint_t mobjs_extra_padding; 147 uint_t mobjs_lvp_not_needed; 148 uint_t mobjs_no_mem_map_sz; 149 uint_t mobjs_check_exec_failed; 150 uint_t mobjs_lvp_used; 151 uint_t mobjs_wrong_model; 152 uint_t mobjs_noexec_fs; 153 uint_t mobjs_e2big_et_rel; 154 uint_t mobjs_et_rel_mapped; 155 uint_t mobjs_unknown_elf_type; 156 uint_t mobjs_phent32_too_small; 157 uint_t mobjs_phent64_too_small; 158 uint_t mobjs_inval_elf_class; 159 uint_t mobjs_too_many_phdrs; 160 uint_t mobjs_no_phsize; 161 uint_t mobjs_phsize_large; 162 uint_t mobjs_phsize_xtralarge; 163 uint_t mobjs_fast_wrong_model; 164 uint_t mobjs_fast_e2big; 165 uint_t mobjs_fast; 166 uint_t mobjs_fast_success; 167 uint_t mobjs_fast_not_now; 168 uint_t mobjs_small_file; 169 uint_t mobjs_read_error; 170 uint_t mobjs_unsupported; 171 uint_t mobjs_flat_e2big; 172 uint_t mobjs_phent_align32; 173 uint_t mobjs_phent_align64; 174 uint_t mobjs_lib_va_find_hit; 175 uint_t mobjs_lib_va_find_delay_delete; 176 uint_t mobjs_lib_va_find_delete; 177 uint_t mobjs_lib_va_add_delay_delete; 178 uint_t mobjs_lib_va_add_delete; 179 uint_t mobjs_lib_va_create_failure; 180 uint_t mobjs_min_align; 181 #if defined(__sparc) 182 uint_t mobjs_aout_uzero_fault; 183 uint_t mobjs_aout_64bit_try; 184 uint_t mobjs_aout_noexec; 185 uint_t mobjs_aout_e2big; 186 uint_t mobjs_aout_lib; 187 uint_t mobjs_aout_fixed; 188 
uint_t mobjs_aout_zfoddiff; 189 uint_t mobjs_aout_map_bss; 190 uint_t mobjs_aout_bss_fail; 191 uint_t mobjs_aout_nlist; 192 uint_t mobjs_aout_addr_in_use; 193 #endif 194 } mobj_stats; 195 196 #define MOBJ_STAT_ADD(stat) ((mobj_stats.mobjs_##stat)++) 197 #else 198 #define MOBJ_STAT_ADD(stat) 199 #endif 200 201 /* 202 * Check if addr is at or above the address space reserved for the stack. 203 * The stack is at the top of the address space for all sparc processes 204 * and 64 bit x86 processes. For 32 bit x86, the stack is not at the top 205 * of the address space and thus this check will always return false for 206 * 32 bit x86 processes. 207 */ 208 #if defined(__sparc) 209 #define OVERLAPS_STACK(addr, p) \ 210 (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK))) 211 #elif defined(__amd64) 212 #define OVERLAPS_STACK(addr, p) \ 213 ((p->p_model == DATAMODEL_LP64) && \ 214 (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))) 215 #elif defined(__i386) 216 #define OVERLAPS_STACK(addr, p) 0 217 #endif 218 219 /* lv_flags values - bitmap */ 220 #define LV_ELF32 0x1 /* 32 bit ELF file */ 221 #define LV_ELF64 0x2 /* 64 bit ELF file */ 222 #define LV_DEL 0x4 /* delete when lv_refcnt hits zero */ 223 224 /* 225 * Note: lv_num_segs will denote how many segments this file has and will 226 * only be set after the lv_mps array has been filled out. 227 * lv_mps can only be valid if lv_num_segs is non-zero. 228 */ 229 struct lib_va { 230 struct lib_va *lv_next; 231 caddr_t lv_base_va; /* start va for library */ 232 ssize_t lv_len; /* total va span of library */ 233 size_t lv_align; /* minimum alignment */ 234 uint64_t lv_nodeid; /* filesystem node id */ 235 uint64_t lv_fsid; /* filesystem id */ 236 timestruc_t lv_ctime; /* last time file was changed */ 237 timestruc_t lv_mtime; /* or modified */ 238 mmapobj_result_t lv_mps[LIBVA_CACHED_SEGS]; /* cached pheaders */ 239 int lv_num_segs; /* # segs for this file */ 240 int lv_flags; 241 uint_t lv_refcnt; /* number of holds on struct */ 242 }; 243 244 #define LIB_VA_SIZE 1024 245 #define LIB_VA_MASK (LIB_VA_SIZE - 1) 246 #define LIB_VA_MUTEX_SHIFT 3 247 248 #if (LIB_VA_SIZE & (LIB_VA_SIZE - 1)) 249 #error "LIB_VA_SIZE is not a power of 2" 250 #endif 251 252 static struct lib_va *lib_va_hash[LIB_VA_SIZE]; 253 static kmutex_t lib_va_hash_mutex[LIB_VA_SIZE >> LIB_VA_MUTEX_SHIFT]; 254 255 #define LIB_VA_HASH_MUTEX(index) \ 256 (&lib_va_hash_mutex[index >> LIB_VA_MUTEX_SHIFT]) 257 258 #define LIB_VA_HASH(nodeid) \ 259 (((nodeid) ^ ((nodeid) << 7) ^ ((nodeid) << 13)) & LIB_VA_MASK) 260 261 #define LIB_VA_MATCH_ID(arg1, arg2) \ 262 ((arg1)->lv_nodeid == (arg2)->va_nodeid && \ 263 (arg1)->lv_fsid == (arg2)->va_fsid) 264 265 #define LIB_VA_MATCH_TIME(arg1, arg2) \ 266 ((arg1)->lv_ctime.tv_sec == (arg2)->va_ctime.tv_sec && \ 267 (arg1)->lv_mtime.tv_sec == (arg2)->va_mtime.tv_sec && \ 268 (arg1)->lv_ctime.tv_nsec == (arg2)->va_ctime.tv_nsec && \ 269 (arg1)->lv_mtime.tv_nsec == (arg2)->va_mtime.tv_nsec) 270 271 #define LIB_VA_MATCH(arg1, arg2) \ 272 (LIB_VA_MATCH_ID(arg1, arg2) && LIB_VA_MATCH_TIME(arg1, arg2)) 273 274 /* 275 * lib_va will be used for optimized allocation of address ranges for 276 * libraries, such that subsequent mappings of the same library will attempt 277 * to use the same VA as previous mappings of that library. 278 * In order to map libraries at the same VA in many processes, we need to carve 279 * out our own address space for them which is unique across many processes.
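 *
 * For illustration, once the per-model vmem arena exists, a preferred
 * start address is handed out roughly as follows (this mirrors the
 * allocation done in mmapobj_alloc_start_addr() below):
 *
 *	base = vmem_xalloc(model_vmem, len, align, 0, 0, NULL, NULL,
 *	    VM_NOSLEEP | VM_ENDALLOC);
 *
 * and is handed back with vmem_xfree() when the corresponding lib_va
 * entry is freed in lib_va_free().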
280 * We use different arenas for 32 bit and 64 bit libraries. 281 * 282 * Since the 32 bit address space is relatively small, we limit the number of 283 * libraries which try to use consistent virtual addresses to lib_threshold. 284 * For 64 bit libraries there is no such limit since the address space is large. 285 */ 286 static vmem_t *lib_va_32_arena; 287 static vmem_t *lib_va_64_arena; 288 uint_t lib_threshold = 20; /* modifiable via /etc/system */ 289 290 static kmutex_t lib_va_init_mutex; /* no need to initialize */ 291 292 /* 293 * Number of 32 bit and 64 bit libraries in lib_va hash. 294 */ 295 static uint_t libs_mapped_32 = 0; 296 static uint_t libs_mapped_64 = 0; 297 298 /* 299 * Free up the resources associated with lvp as well as lvp itself. 300 * We also decrement the number of libraries mapped via a lib_va 301 * cached virtual address. 302 */ 303 void 304 lib_va_free(struct lib_va *lvp) 305 { 306 int is_64bit = lvp->lv_flags & LV_ELF64; 307 ASSERT(lvp->lv_refcnt == 0); 308 309 if (lvp->lv_base_va != NULL) { 310 vmem_xfree(is_64bit ? lib_va_64_arena : lib_va_32_arena, 311 lvp->lv_base_va, lvp->lv_len); 312 if (is_64bit) { 313 atomic_dec_32(&libs_mapped_64); 314 } else { 315 atomic_dec_32(&libs_mapped_32); 316 } 317 } 318 kmem_free(lvp, sizeof (struct lib_va)); 319 } 320 321 /* 322 * See if the file associated with the vap passed in is in the lib_va hash. 323 * If it is and the file has not been modified since last use, then 324 * return a pointer to that data. Otherwise, return NULL if the file has 325 * changed or the file was not found in the hash. 326 */ 327 static struct lib_va * 328 lib_va_find(vattr_t *vap) 329 { 330 struct lib_va *lvp; 331 struct lib_va *del = NULL; 332 struct lib_va **tmp; 333 uint_t index; 334 index = LIB_VA_HASH(vap->va_nodeid); 335 336 mutex_enter(LIB_VA_HASH_MUTEX(index)); 337 tmp = &lib_va_hash[index]; 338 while (*tmp != NULL) { 339 lvp = *tmp; 340 if (LIB_VA_MATCH_ID(lvp, vap)) { 341 if (LIB_VA_MATCH_TIME(lvp, vap)) { 342 ASSERT((lvp->lv_flags & LV_DEL) == 0); 343 lvp->lv_refcnt++; 344 MOBJ_STAT_ADD(lib_va_find_hit); 345 } else { 346 /* 347 * file was updated since last use. 348 * need to remove it from list. 349 */ 350 del = lvp; 351 *tmp = del->lv_next; 352 del->lv_next = NULL; 353 /* 354 * If we can't delete it now, mark it for later 355 */ 356 if (del->lv_refcnt) { 357 MOBJ_STAT_ADD(lib_va_find_delay_delete); 358 del->lv_flags |= LV_DEL; 359 del = NULL; 360 } 361 lvp = NULL; 362 } 363 mutex_exit(LIB_VA_HASH_MUTEX(index)); 364 if (del) { 365 ASSERT(del->lv_refcnt == 0); 366 MOBJ_STAT_ADD(lib_va_find_delete); 367 lib_va_free(del); 368 } 369 return (lvp); 370 } 371 tmp = &lvp->lv_next; 372 } 373 mutex_exit(LIB_VA_HASH_MUTEX(index)); 374 return (NULL); 375 } 376 377 /* 378 * Add a new entry to the lib_va hash. 379 * Search the hash while holding the appropriate mutex to make sure that the 380 * data is not already in the cache. If we find data that is in the cache 381 * already and has not been modified since last use, we return NULL. If it 382 * has been modified since last use, we will remove that entry from 383 * the hash and it will be deleted once it's reference count reaches zero. 384 * If there is no current entry in the hash we will add the new entry and 385 * return it to the caller who is responsible for calling lib_va_release to 386 * drop their reference count on it. 387 * 388 * lv_num_segs will be set to zero since the caller needs to add that 389 * information to the data structure. 
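 *
 * A typical caller (see process_phdr() below) uses the returned entry
 * roughly as follows, filling in the mapping data before publishing it:
 *
 *	lvp = lib_va_add_hash(base, len, align, &vattr);
 *	... fill out lvp->lv_mps ...
 *	membar_producer();
 *	lvp->lv_num_segs = loadable;
 *	lib_va_release(lvp);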
390 */ 391 static struct lib_va * 392 lib_va_add_hash(caddr_t base_va, ssize_t len, size_t align, vattr_t *vap) 393 { 394 struct lib_va *lvp; 395 uint_t index; 396 model_t model; 397 struct lib_va **tmp; 398 struct lib_va *del = NULL; 399 400 model = get_udatamodel(); 401 index = LIB_VA_HASH(vap->va_nodeid); 402 403 lvp = kmem_alloc(sizeof (struct lib_va), KM_SLEEP); 404 405 mutex_enter(LIB_VA_HASH_MUTEX(index)); 406 407 /* 408 * Make sure not adding same data a second time. 409 * The hash chains should be relatively short and adding 410 * is a relatively rare event, so it's worth the check. 411 */ 412 tmp = &lib_va_hash[index]; 413 while (*tmp != NULL) { 414 if (LIB_VA_MATCH_ID(*tmp, vap)) { 415 if (LIB_VA_MATCH_TIME(*tmp, vap)) { 416 mutex_exit(LIB_VA_HASH_MUTEX(index)); 417 kmem_free(lvp, sizeof (struct lib_va)); 418 return (NULL); 419 } 420 421 /* 422 * We have the same nodeid and fsid but the file has 423 * been modified since we last saw it. 424 * Need to remove the old node and add this new 425 * one. 426 * Could probably use a callback mechanism to make 427 * this cleaner. 428 */ 429 ASSERT(del == NULL); 430 del = *tmp; 431 *tmp = del->lv_next; 432 del->lv_next = NULL; 433 434 /* 435 * Check to see if we can free it. If lv_refcnt 436 * is greater than zero, than some other thread 437 * has a reference to the one we want to delete 438 * and we can not delete it. All of this is done 439 * under the lib_va_hash_mutex lock so it is atomic. 440 */ 441 if (del->lv_refcnt) { 442 MOBJ_STAT_ADD(lib_va_add_delay_delete); 443 del->lv_flags |= LV_DEL; 444 del = NULL; 445 } 446 /* tmp is already advanced */ 447 continue; 448 } 449 tmp = &((*tmp)->lv_next); 450 } 451 452 lvp->lv_base_va = base_va; 453 lvp->lv_len = len; 454 lvp->lv_align = align; 455 lvp->lv_nodeid = vap->va_nodeid; 456 lvp->lv_fsid = vap->va_fsid; 457 lvp->lv_ctime.tv_sec = vap->va_ctime.tv_sec; 458 lvp->lv_ctime.tv_nsec = vap->va_ctime.tv_nsec; 459 lvp->lv_mtime.tv_sec = vap->va_mtime.tv_sec; 460 lvp->lv_mtime.tv_nsec = vap->va_mtime.tv_nsec; 461 lvp->lv_next = NULL; 462 lvp->lv_refcnt = 1; 463 464 /* Caller responsible for filling this and lv_mps out */ 465 lvp->lv_num_segs = 0; 466 467 if (model == DATAMODEL_LP64) { 468 lvp->lv_flags = LV_ELF64; 469 } else { 470 ASSERT(model == DATAMODEL_ILP32); 471 lvp->lv_flags = LV_ELF32; 472 } 473 474 if (base_va != NULL) { 475 if (model == DATAMODEL_LP64) { 476 atomic_inc_32(&libs_mapped_64); 477 } else { 478 ASSERT(model == DATAMODEL_ILP32); 479 atomic_inc_32(&libs_mapped_32); 480 } 481 } 482 ASSERT(*tmp == NULL); 483 *tmp = lvp; 484 mutex_exit(LIB_VA_HASH_MUTEX(index)); 485 if (del) { 486 ASSERT(del->lv_refcnt == 0); 487 MOBJ_STAT_ADD(lib_va_add_delete); 488 lib_va_free(del); 489 } 490 return (lvp); 491 } 492 493 /* 494 * Release the hold on lvp which was acquired by lib_va_find or lib_va_add_hash. 495 * In addition, if this is the last hold and lvp is marked for deletion, 496 * free up it's reserved address space and free the structure. 
497 */ 498 static void 499 lib_va_release(struct lib_va *lvp) 500 { 501 uint_t index; 502 int to_del = 0; 503 504 ASSERT(lvp->lv_refcnt > 0); 505 506 index = LIB_VA_HASH(lvp->lv_nodeid); 507 mutex_enter(LIB_VA_HASH_MUTEX(index)); 508 if (--lvp->lv_refcnt == 0 && (lvp->lv_flags & LV_DEL)) { 509 to_del = 1; 510 } 511 mutex_exit(LIB_VA_HASH_MUTEX(index)); 512 if (to_del) { 513 ASSERT(lvp->lv_next == 0); 514 lib_va_free(lvp); 515 } 516 } 517 518 /* 519 * Dummy function for mapping through /dev/null 520 * Normally I would have used mmmmap in common/io/mem.c 521 * but that is a static function, and for /dev/null, it 522 * just returns -1. 523 */ 524 /* ARGSUSED */ 525 static int 526 mmapobj_dummy(dev_t dev, off_t off, int prot) 527 { 528 return (-1); 529 } 530 531 /* 532 * Called when an error occurred which requires mmapobj to return failure. 533 * All mapped objects will be unmapped and /dev/null mappings will be 534 * reclaimed if necessary. 535 * num_mapped is the number of elements of mrp which have been mapped, and 536 * num_segs is the total number of elements in mrp. 537 * For e_type ET_EXEC, we need to unmap all of the elements in mrp since 538 * we had already made reservations for them. 539 * If num_mapped equals num_segs, then we know that we had fully mapped 540 * the file and only need to clean up the segments described. 541 * If they are not equal, then for ET_DYN we will unmap the range from the 542 * end of the last mapped segment to the end of the last segment in mrp 543 * since we would have made a reservation for that memory earlier. 544 * If e_type is passed in as zero, num_mapped must equal num_segs. 545 */ 546 void 547 mmapobj_unmap(mmapobj_result_t *mrp, int num_mapped, int num_segs, 548 ushort_t e_type) 549 { 550 int i; 551 struct as *as = curproc->p_as; 552 caddr_t addr; 553 size_t size; 554 555 if (e_type == ET_EXEC) { 556 num_mapped = num_segs; 557 } 558 #ifdef DEBUG 559 if (e_type == 0) { 560 ASSERT(num_mapped == num_segs); 561 } 562 #endif 563 564 MOBJ_STAT_ADD(unmap_called); 565 for (i = 0; i < num_mapped; i++) { 566 567 /* 568 * If we are going to have to create a mapping we need to 569 * make sure that no one else will use the address we 570 * need to remap between the time it is unmapped and 571 * mapped below. 572 */ 573 if (mrp[i].mr_flags & MR_RESV) { 574 as_rangelock(as); 575 } 576 /* Always need to unmap what we mapped */ 577 (void) as_unmap(as, mrp[i].mr_addr, mrp[i].mr_msize); 578 579 /* Need to reclaim /dev/null reservation from earlier */ 580 if (mrp[i].mr_flags & MR_RESV) { 581 struct segdev_crargs dev_a; 582 583 ASSERT(e_type != ET_DYN); 584 /* 585 * Use seg_dev segment driver for /dev/null mapping. 
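 * The reservation is recreated through the seg_dev driver with a type
 * that is neither MAP_SHARED nor MAP_PRIVATE; check_exec_addrs() below
 * relies on those two properties to recognize a /dev/null placeholder
 * that can safely be reused.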
586 */ 587 dev_a.mapfunc = mmapobj_dummy; 588 dev_a.dev = makedevice(mm_major, M_NULL); 589 dev_a.offset = 0; 590 dev_a.type = 0; /* neither PRIVATE nor SHARED */ 591 dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE; 592 dev_a.hat_attr = 0; 593 dev_a.hat_flags = 0; 594 595 (void) as_map(as, mrp[i].mr_addr, mrp[i].mr_msize, 596 segdev_create, &dev_a); 597 MOBJ_STAT_ADD(remap_devnull); 598 as_rangeunlock(as); 599 } 600 } 601 602 if (num_mapped != num_segs) { 603 ASSERT(e_type == ET_DYN); 604 /* Need to unmap any reservation made after last mapped seg */ 605 if (num_mapped == 0) { 606 addr = mrp[0].mr_addr; 607 } else { 608 addr = mrp[num_mapped - 1].mr_addr + 609 mrp[num_mapped - 1].mr_msize; 610 } 611 size = (size_t)mrp[num_segs - 1].mr_addr + 612 mrp[num_segs - 1].mr_msize - (size_t)addr; 613 (void) as_unmap(as, addr, size); 614 615 /* 616 * Now we need to unmap the holes between mapped segs. 617 * Note that we have not mapped all of the segments and thus 618 * the holes between segments would not have been unmapped 619 * yet. If num_mapped == num_segs, then all of the holes 620 * between segments would have already been unmapped. 621 */ 622 623 for (i = 1; i < num_mapped; i++) { 624 addr = mrp[i - 1].mr_addr + mrp[i - 1].mr_msize; 625 size = mrp[i].mr_addr - addr; 626 (void) as_unmap(as, addr, size); 627 } 628 } 629 } 630 631 /* 632 * We need to add the start address into mrp so that the unmap function 633 * has absolute addresses to use. 634 */ 635 static void 636 mmapobj_unmap_exec(mmapobj_result_t *mrp, int num_mapped, caddr_t start_addr) 637 { 638 int i; 639 640 for (i = 0; i < num_mapped; i++) { 641 mrp[i].mr_addr += (size_t)start_addr; 642 } 643 mmapobj_unmap(mrp, num_mapped, num_mapped, ET_EXEC); 644 } 645 646 static caddr_t 647 mmapobj_lookup_start_addr(struct lib_va *lvp) 648 { 649 proc_t *p = curproc; 650 struct as *as = p->p_as; 651 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 652 int error; 653 uint_t ma_flags = _MAP_LOW32; 654 caddr_t base = NULL; 655 size_t len; 656 size_t align; 657 658 ASSERT(lvp != NULL); 659 MOBJ_STAT_ADD(lookup_start); 660 661 as_rangelock(as); 662 663 base = lvp->lv_base_va; 664 len = lvp->lv_len; 665 666 /* 667 * If we don't have an expected base address, or the one that we want 668 * to use is not available or acceptable, go get an acceptable 669 * address range. 670 */ 671 if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) || 672 valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) != 673 RANGE_OKAY || OVERLAPS_STACK(base + len, p)) { 674 if (lvp->lv_flags & LV_ELF64) { 675 ma_flags = 0; 676 } 677 678 align = lvp->lv_align; 679 if (align > 1) { 680 ma_flags |= MAP_ALIGN; 681 } 682 683 base = (caddr_t)align; 684 map_addr(&base, len, 0, 1, ma_flags); 685 } 686 687 /* 688 * Need to reserve the address space we're going to use. 689 * Don't reserve swap space since we'll be mapping over this. 690 */ 691 if (base != NULL) { 692 crargs.flags |= MAP_NORESERVE; 693 error = as_map(as, base, len, segvn_create, &crargs); 694 if (error) { 695 base = NULL; 696 } 697 } 698 699 as_rangeunlock(as); 700 return (base); 701 } 702 703 /* 704 * Get the starting address for a given file to be mapped and return it 705 * to the caller. If we're using lib_va and we need to allocate an address, 706 * we will attempt to allocate it from the global reserved pool such that the 707 * same address can be used in the future for this file. If we can't use the 708 * reserved address then we just get one that will fit in our address space. 
709 * 710 * Returns the starting virtual address for the range to be mapped or NULL 711 * if an error is encountered. If we successfully insert the requested info 712 * into the lib_va hash, then *lvpp will be set to point to this lib_va 713 * structure. The structure will have a hold on it and thus lib_va_release 714 * needs to be called on it by the caller. This function will not fill out 715 * lv_mps or lv_num_segs since it does not have enough information to do so. 716 * The caller is responsible for doing this and making sure that any modifications 717 * to lv_mps are visible before setting lv_num_segs. 718 */ 719 static caddr_t 720 mmapobj_alloc_start_addr(struct lib_va **lvpp, size_t len, int use_lib_va, 721 size_t align, vattr_t *vap) 722 { 723 proc_t *p = curproc; 724 struct as *as = p->p_as; 725 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 726 int error; 727 model_t model; 728 uint_t ma_flags = _MAP_LOW32; 729 caddr_t base = NULL; 730 vmem_t *model_vmem; 731 size_t lib_va_start; 732 size_t lib_va_end; 733 size_t lib_va_len; 734 735 ASSERT(lvpp != NULL); 736 737 MOBJ_STAT_ADD(alloc_start); 738 model = get_udatamodel(); 739 740 if (model == DATAMODEL_LP64) { 741 ma_flags = 0; 742 model_vmem = lib_va_64_arena; 743 } else { 744 ASSERT(model == DATAMODEL_ILP32); 745 model_vmem = lib_va_32_arena; 746 } 747 748 if (align > 1) { 749 ma_flags |= MAP_ALIGN; 750 } 751 if (use_lib_va) { 752 /* 753 * The first time through, we need to set up the lib_va arenas. 754 * We call map_addr to find a suitable range of memory to map 755 * the given library, and we will set the highest address 756 * in our vmem arena to the end of this address range. 757 * We allow up to half of the address space to be used 758 * for lib_va addresses but we do not prevent any allocations 759 * in this range from other allocation paths. 760 */ 761 if (lib_va_64_arena == NULL && model == DATAMODEL_LP64) { 762 mutex_enter(&lib_va_init_mutex); 763 if (lib_va_64_arena == NULL) { 764 base = (caddr_t)align; 765 as_rangelock(as); 766 map_addr(&base, len, 0, 1, ma_flags); 767 as_rangeunlock(as); 768 if (base == NULL) { 769 mutex_exit(&lib_va_init_mutex); 770 MOBJ_STAT_ADD(lib_va_create_failure); 771 goto nolibva; 772 } 773 lib_va_end = (size_t)base + len; 774 lib_va_len = lib_va_end >> 1; 775 lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE); 776 lib_va_start = lib_va_end - lib_va_len; 777 778 /* 779 * Need to make sure we avoid the address hole. 780 * We know lib_va_end is valid but we need to 781 * make sure lib_va_start is as well.
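 * For example, if the computed range [lib_va_start, lib_va_end) were
 * to straddle hole_end, lib_va_start is rounded up to
 * P2ROUNDUP(hole_end, PAGESIZE) below so that the arena never spans
 * the VA hole.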
782 */ 783 if ((lib_va_end > (size_t)hole_end) && 784 (lib_va_start < (size_t)hole_end)) { 785 lib_va_start = P2ROUNDUP( 786 (size_t)hole_end, PAGESIZE); 787 lib_va_len = lib_va_end - lib_va_start; 788 } 789 lib_va_64_arena = vmem_create("lib_va_64", 790 (void *)lib_va_start, lib_va_len, PAGESIZE, 791 NULL, NULL, NULL, 0, 792 VM_NOSLEEP | VMC_IDENTIFIER); 793 if (lib_va_64_arena == NULL) { 794 mutex_exit(&lib_va_init_mutex); 795 goto nolibva; 796 } 797 } 798 model_vmem = lib_va_64_arena; 799 mutex_exit(&lib_va_init_mutex); 800 } else if (lib_va_32_arena == NULL && 801 model == DATAMODEL_ILP32) { 802 mutex_enter(&lib_va_init_mutex); 803 if (lib_va_32_arena == NULL) { 804 base = (caddr_t)align; 805 as_rangelock(as); 806 map_addr(&base, len, 0, 1, ma_flags); 807 as_rangeunlock(as); 808 if (base == NULL) { 809 mutex_exit(&lib_va_init_mutex); 810 MOBJ_STAT_ADD(lib_va_create_failure); 811 goto nolibva; 812 } 813 lib_va_end = (size_t)base + len; 814 lib_va_len = lib_va_end >> 1; 815 lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE); 816 lib_va_start = lib_va_end - lib_va_len; 817 lib_va_32_arena = vmem_create("lib_va_32", 818 (void *)lib_va_start, lib_va_len, PAGESIZE, 819 NULL, NULL, NULL, 0, 820 VM_NOSLEEP | VMC_IDENTIFIER); 821 if (lib_va_32_arena == NULL) { 822 mutex_exit(&lib_va_init_mutex); 823 goto nolibva; 824 } 825 } 826 model_vmem = lib_va_32_arena; 827 mutex_exit(&lib_va_init_mutex); 828 } 829 830 if (model == DATAMODEL_LP64 || libs_mapped_32 < lib_threshold) { 831 base = vmem_xalloc(model_vmem, len, align, 0, 0, NULL, 832 NULL, VM_NOSLEEP | VM_ENDALLOC); 833 MOBJ_STAT_ADD(alloc_vmem); 834 } 835 836 /* 837 * Even if the address fails to fit in our address space, 838 * or we can't use a reserved address, 839 * we should still save it off in lib_va_hash. 840 */ 841 *lvpp = lib_va_add_hash(base, len, align, vap); 842 843 /* 844 * Check for collision on insertion and free up our VA space. 845 * This is expected to be rare, so we'll just reset base to 846 * NULL instead of looking it up in the lib_va hash. 847 */ 848 if (*lvpp == NULL) { 849 if (base != NULL) { 850 vmem_xfree(model_vmem, base, len); 851 base = NULL; 852 MOBJ_STAT_ADD(add_collision); 853 } 854 } 855 } 856 857 nolibva: 858 as_rangelock(as); 859 860 /* 861 * If we don't have an expected base address, or the one that we want 862 * to use is not available or acceptable, go get an acceptable 863 * address range. 864 */ 865 if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) || 866 valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) != 867 RANGE_OKAY || OVERLAPS_STACK(base + len, p)) { 868 MOBJ_STAT_ADD(get_addr); 869 base = (caddr_t)align; 870 map_addr(&base, len, 0, 1, ma_flags); 871 } 872 873 /* 874 * Need to reserve the address space we're going to use. 875 * Don't reserve swap space since we'll be mapping over this. 876 */ 877 if (base != NULL) { 878 /* Don't reserve swap space since we'll be mapping over this */ 879 crargs.flags |= MAP_NORESERVE; 880 error = as_map(as, base, len, segvn_create, &crargs); 881 if (error) { 882 base = NULL; 883 } 884 } 885 886 as_rangeunlock(as); 887 return (base); 888 } 889 890 /* 891 * Map the file associated with vp into the address space as a single 892 * read only private mapping. 893 * Returns 0 for success, and non-zero for failure to map the file. 
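 * When padding is requested, the resulting mrp array roughly describes
 * three ranges:
 *	mrp[0]	padding before the file		(MR_PADDING)
 *	mrp[1]	the read only private file mapping
 *	mrp[2]	padding after the file		(MR_PADDING)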
894 */ 895 static int 896 mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding, 897 cred_t *fcred) 898 { 899 int error = 0; 900 struct as *as = curproc->p_as; 901 caddr_t addr = NULL; 902 caddr_t start_addr; 903 size_t len; 904 size_t pad_len; 905 int prot = PROT_USER | PROT_READ; 906 uint_t ma_flags = _MAP_LOW32; 907 vattr_t vattr; 908 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 909 910 if (get_udatamodel() == DATAMODEL_LP64) { 911 ma_flags = 0; 912 } 913 914 vattr.va_mask = AT_SIZE; 915 error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL); 916 if (error) { 917 return (error); 918 } 919 920 len = vattr.va_size; 921 922 ma_flags |= MAP_PRIVATE; 923 if (padding == 0) { 924 MOBJ_STAT_ADD(map_flat_no_padding); 925 error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, 926 ma_flags, fcred, NULL); 927 if (error == 0) { 928 mrp[0].mr_addr = addr; 929 mrp[0].mr_msize = len; 930 mrp[0].mr_fsize = len; 931 mrp[0].mr_offset = 0; 932 mrp[0].mr_prot = prot; 933 mrp[0].mr_flags = 0; 934 } 935 return (error); 936 } 937 938 /* padding was requested so there's more work to be done */ 939 MOBJ_STAT_ADD(map_flat_padding); 940 941 /* No need to reserve swap space now since it will be reserved later */ 942 crargs.flags |= MAP_NORESERVE; 943 944 /* Need to setup padding which can only be in PAGESIZE increments. */ 945 ASSERT((padding & PAGEOFFSET) == 0); 946 pad_len = len + (2 * padding); 947 948 as_rangelock(as); 949 map_addr(&addr, pad_len, 0, 1, ma_flags); 950 error = as_map(as, addr, pad_len, segvn_create, &crargs); 951 as_rangeunlock(as); 952 if (error) { 953 return (error); 954 } 955 start_addr = addr; 956 addr += padding; 957 ma_flags |= MAP_FIXED; 958 error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, ma_flags, 959 fcred, NULL); 960 if (error == 0) { 961 mrp[0].mr_addr = start_addr; 962 mrp[0].mr_msize = padding; 963 mrp[0].mr_fsize = 0; 964 mrp[0].mr_offset = 0; 965 mrp[0].mr_prot = 0; 966 mrp[0].mr_flags = MR_PADDING; 967 968 mrp[1].mr_addr = addr; 969 mrp[1].mr_msize = len; 970 mrp[1].mr_fsize = len; 971 mrp[1].mr_offset = 0; 972 mrp[1].mr_prot = prot; 973 mrp[1].mr_flags = 0; 974 975 mrp[2].mr_addr = addr + P2ROUNDUP(len, PAGESIZE); 976 mrp[2].mr_msize = padding; 977 mrp[2].mr_fsize = 0; 978 mrp[2].mr_offset = 0; 979 mrp[2].mr_prot = 0; 980 mrp[2].mr_flags = MR_PADDING; 981 } else { 982 /* Need to cleanup the as_map from earlier */ 983 (void) as_unmap(as, start_addr, pad_len); 984 } 985 return (error); 986 } 987 988 /* 989 * Map a PT_LOAD or PT_SUNWBSS section of an executable file into the user's 990 * address space. 991 * vp - vnode to be mapped in 992 * addr - start address 993 * len - length of vp to be mapped 994 * zfodlen - length of zero filled memory after len above 995 * offset - offset into file where mapping should start 996 * prot - protections for this mapping 997 * fcred - credentials for the file associated with vp at open time. 998 */ 999 static int 1000 mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, 1001 off_t offset, int prot, cred_t *fcred) 1002 { 1003 int error = 0; 1004 caddr_t zfodbase, oldaddr; 1005 size_t oldlen; 1006 size_t end; 1007 size_t zfoddiff; 1008 label_t ljb; 1009 struct as *as = curproc->p_as; 1010 model_t model; 1011 int full_page; 1012 1013 /* 1014 * See if addr and offset are aligned such that we can map in 1015 * full pages instead of partial pages. 
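 * Roughly: if addr and offset share the same offset within a page, the
 * file data can be mapped directly with VOP_MAP(); otherwise the range
 * is mapped anonymously and the data is read in from disk by hand (the
 * unaligned path below).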
1016 */ 1017 full_page = (((uintptr_t)addr & PAGEOFFSET) == 1018 ((uintptr_t)offset & PAGEOFFSET)); 1019 1020 model = get_udatamodel(); 1021 1022 oldaddr = addr; 1023 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1024 if (len) { 1025 spgcnt_t availm, npages; 1026 int preread; 1027 uint_t mflag = MAP_PRIVATE | MAP_FIXED; 1028 1029 if (model == DATAMODEL_ILP32) { 1030 mflag |= _MAP_LOW32; 1031 } 1032 /* We may need to map in extra bytes */ 1033 oldlen = len; 1034 len += ((size_t)oldaddr & PAGEOFFSET); 1035 1036 if (full_page) { 1037 offset = (off_t)((uintptr_t)offset & PAGEMASK); 1038 if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) { 1039 mflag |= MAP_TEXT; 1040 MOBJ_STAT_ADD(map_ptload_text); 1041 } else { 1042 mflag |= MAP_INITDATA; 1043 MOBJ_STAT_ADD(map_ptload_initdata); 1044 } 1045 1046 /* 1047 * maxprot is passed as PROT_ALL so that mdb can 1048 * write to this segment. 1049 */ 1050 if (error = VOP_MAP(vp, (offset_t)offset, as, &addr, 1051 len, prot, PROT_ALL, mflag, fcred, NULL)) { 1052 return (error); 1053 } 1054 1055 /* 1056 * If the segment can fit and is relatively small, then 1057 * we prefault the entire segment in. This is based 1058 * on the model that says the best working set of a 1059 * small program is all of its pages. 1060 * We only do this if freemem will not drop below 1061 * lotsfree since we don't want to induce paging. 1062 */ 1063 npages = (spgcnt_t)btopr(len); 1064 availm = freemem - lotsfree; 1065 preread = (npages < availm && len < PGTHRESH) ? 1 : 0; 1066 1067 /* 1068 * If we aren't prefaulting the segment, 1069 * increment "deficit", if necessary to ensure 1070 * that pages will become available when this 1071 * process starts executing. 1072 */ 1073 if (preread == 0 && npages > availm && 1074 deficit < lotsfree) { 1075 deficit += MIN((pgcnt_t)(npages - availm), 1076 lotsfree - deficit); 1077 } 1078 1079 if (preread) { 1080 (void) as_faulta(as, addr, len); 1081 MOBJ_STAT_ADD(map_ptload_preread); 1082 } 1083 } else { 1084 /* 1085 * addr and offset were not aligned such that we could 1086 * use VOP_MAP, thus we need to as_map the memory we 1087 * need and then read the data in from disk. 1088 * This code path is a corner case which should never 1089 * be taken, but hand crafted binaries could trigger 1090 * this logic and it needs to work correctly. 1091 */ 1092 MOBJ_STAT_ADD(map_ptload_unaligned_text); 1093 as_rangelock(as); 1094 (void) as_unmap(as, addr, len); 1095 1096 /* 1097 * We use zfod_argsp because we need to be able to 1098 * write to the mapping and then we'll change the 1099 * protections later if they are incorrect. 1100 */ 1101 error = as_map(as, addr, len, segvn_create, zfod_argsp); 1102 as_rangeunlock(as); 1103 if (error) { 1104 MOBJ_STAT_ADD(map_ptload_unaligned_map_fail); 1105 return (error); 1106 } 1107 1108 /* Now read in the data from disk */ 1109 error = vn_rdwr(UIO_READ, vp, oldaddr, oldlen, offset, 1110 UIO_USERSPACE, 0, (rlim64_t)0, fcred, NULL); 1111 if (error) { 1112 MOBJ_STAT_ADD(map_ptload_unaligned_read_fail); 1113 return (error); 1114 } 1115 1116 /* 1117 * Now set protections. 1118 */ 1119 if (prot != PROT_ZFOD) { 1120 (void) as_setprot(as, addr, len, prot); 1121 } 1122 } 1123 } 1124 1125 if (zfodlen) { 1126 end = (size_t)addr + len; 1127 zfodbase = (caddr_t)P2ROUNDUP(end, PAGESIZE); 1128 zfoddiff = (uintptr_t)zfodbase - end; 1129 if (zfoddiff) { 1130 /* 1131 * Before we go to zero the remaining space on the last 1132 * page, make sure we have write permission. 
1133 * 1134 * We need to be careful how we zero-fill the last page 1135 * if the protection does not include PROT_WRITE. Using 1136 * as_setprot() can cause the VM segment code to call 1137 * segvn_vpage(), which must allocate a page struct for 1138 * each page in the segment. If we have a very large 1139 * segment, this may fail, so we check for that, even 1140 * though we ignore other return values from as_setprot. 1141 */ 1142 MOBJ_STAT_ADD(zfoddiff); 1143 if ((prot & PROT_WRITE) == 0) { 1144 if (as_setprot(as, (caddr_t)end, zfoddiff, 1145 prot | PROT_WRITE) == ENOMEM) 1146 return (ENOMEM); 1147 MOBJ_STAT_ADD(zfoddiff_nowrite); 1148 } 1149 if (on_fault(&ljb)) { 1150 no_fault(); 1151 if ((prot & PROT_WRITE) == 0) { 1152 (void) as_setprot(as, (caddr_t)end, 1153 zfoddiff, prot); 1154 } 1155 return (EFAULT); 1156 } 1157 uzero((void *)end, zfoddiff); 1158 no_fault(); 1159 1160 /* 1161 * Remove the temporary write permission to return to the original state 1162 */ 1163 if ((prot & PROT_WRITE) == 0) { 1164 (void) as_setprot(as, (caddr_t)end, 1165 zfoddiff, prot); 1166 } 1167 } 1168 if (zfodlen > zfoddiff) { 1169 struct segvn_crargs crargs = 1170 SEGVN_ZFOD_ARGS(prot, PROT_ALL); 1171 1172 MOBJ_STAT_ADD(zfodextra); 1173 zfodlen -= zfoddiff; 1174 crargs.szc = AS_MAP_NO_LPOOB; 1175 1176 1177 as_rangelock(as); 1178 (void) as_unmap(as, (caddr_t)zfodbase, zfodlen); 1179 error = as_map(as, (caddr_t)zfodbase, 1180 zfodlen, segvn_create, &crargs); 1181 as_rangeunlock(as); 1182 if (error) { 1183 return (error); 1184 } 1185 } 1186 } 1187 return (0); 1188 } 1189 1190 /* 1191 * Map the ELF file represented by vp into the user's address space. The 1192 * first mapping will start at start_addr and there will be num_elements 1193 * mappings. The mappings are described by the data in mrp which may be 1194 * modified upon returning from this function. 1195 * Returns 0 for success or errno for failure.
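 * On entry, each mr_addr in mrp is relative to start_addr; below it is
 * converted to an absolute, page aligned address, and mr_msize and
 * mr_offset are adjusted to reflect the page offset that was dropped.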
1196 */ 1197 static int 1198 mmapobj_map_elf(struct vnode *vp, caddr_t start_addr, mmapobj_result_t *mrp, 1199 int num_elements, cred_t *fcred, ushort_t e_type) 1200 { 1201 int i; 1202 int ret; 1203 caddr_t lo; 1204 caddr_t hi; 1205 struct as *as = curproc->p_as; 1206 1207 for (i = 0; i < num_elements; i++) { 1208 caddr_t addr; 1209 size_t p_memsz; 1210 size_t p_filesz; 1211 size_t zfodlen; 1212 offset_t p_offset; 1213 size_t dif; 1214 int prot; 1215 1216 /* Always need to adjust mr_addr */ 1217 addr = start_addr + (size_t)(mrp[i].mr_addr); 1218 mrp[i].mr_addr = 1219 (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1220 1221 /* Padding has already been mapped */ 1222 if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) { 1223 continue; 1224 } 1225 p_memsz = mrp[i].mr_msize; 1226 p_filesz = mrp[i].mr_fsize; 1227 zfodlen = p_memsz - p_filesz; 1228 p_offset = mrp[i].mr_offset; 1229 dif = (uintptr_t)(addr) & PAGEOFFSET; 1230 prot = mrp[i].mr_prot | PROT_USER; 1231 ret = mmapobj_map_ptload(vp, addr, p_filesz, zfodlen, 1232 p_offset, prot, fcred); 1233 if (ret != 0) { 1234 MOBJ_STAT_ADD(ptload_failed); 1235 mmapobj_unmap(mrp, i, num_elements, e_type); 1236 return (ret); 1237 } 1238 1239 /* Need to cleanup mrp to reflect the actual values used */ 1240 mrp[i].mr_msize += dif; 1241 mrp[i].mr_offset = (size_t)addr & PAGEOFFSET; 1242 } 1243 1244 /* Also need to unmap any holes created above */ 1245 if (num_elements == 1) { 1246 MOBJ_STAT_ADD(map_elf_no_holes); 1247 return (0); 1248 } 1249 if (e_type == ET_EXEC) { 1250 return (0); 1251 } 1252 1253 as_rangelock(as); 1254 lo = start_addr; 1255 hi = mrp[0].mr_addr; 1256 1257 /* Remove holes made by the rest of the segments */ 1258 for (i = 0; i < num_elements - 1; i++) { 1259 lo = (caddr_t)P2ROUNDUP((size_t)(mrp[i].mr_addr) + 1260 mrp[i].mr_msize, PAGESIZE); 1261 hi = mrp[i + 1].mr_addr; 1262 if (lo < hi) { 1263 /* 1264 * If as_unmap fails we just use up a bit of extra 1265 * space 1266 */ 1267 (void) as_unmap(as, (caddr_t)lo, 1268 (size_t)hi - (size_t)lo); 1269 MOBJ_STAT_ADD(unmap_hole); 1270 } 1271 } 1272 as_rangeunlock(as); 1273 1274 return (0); 1275 } 1276 1277 /* Ugly hack to get STRUCT_* macros to work below */ 1278 struct myphdr { 1279 Phdr x; /* native version */ 1280 }; 1281 1282 struct myphdr32 { 1283 Elf32_Phdr x; 1284 }; 1285 1286 /* 1287 * Calculate and return the number of loadable segments in the ELF Phdr 1288 * represented by phdrbase as well as the len of the total mapping and 1289 * the max alignment that is needed for a given segment. On success, 1290 * 0 is returned, and *len, *loadable and *align have been filled out. 1291 * On failure, errno will be returned, which in this case is ENOTSUP 1292 * if we were passed an ELF file with overlapping segments. 1293 */ 1294 static int 1295 calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, 1296 int *loadable, size_t *align) 1297 { 1298 int i; 1299 int hsize; 1300 model_t model; 1301 ushort_t e_type = ehdrp->e_type; /* same offset 32 and 64 bit */ 1302 uint_t p_type; 1303 offset_t p_offset; 1304 size_t p_memsz; 1305 size_t p_align; 1306 caddr_t vaddr; 1307 int num_segs = 0; 1308 caddr_t start_addr = NULL; 1309 caddr_t p_end = NULL; 1310 size_t max_align = 0; 1311 size_t min_align = PAGESIZE; /* needed for vmem_xalloc */ 1312 STRUCT_HANDLE(myphdr, mph); 1313 #if defined(__sparc) 1314 extern int vac_size; 1315 1316 /* 1317 * Want to prevent aliasing by making the start address at least be 1318 * aligned to vac_size. 
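 * Aligning the mapping to a multiple of the virtual cache size keeps
 * every mapping of the object on the same cache color, so no illegal
 * VAC aliases can be created.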
1319 */ 1320 min_align = MAX(PAGESIZE, vac_size); 1321 #endif 1322 1323 model = get_udatamodel(); 1324 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1325 1326 /* hsize alignment should have been checked before calling this func */ 1327 if (model == DATAMODEL_LP64) { 1328 hsize = ehdrp->e_phentsize; 1329 if (hsize & 7) { 1330 return (ENOTSUP); 1331 } 1332 } else { 1333 ASSERT(model == DATAMODEL_ILP32); 1334 hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize; 1335 if (hsize & 3) { 1336 return (ENOTSUP); 1337 } 1338 } 1339 1340 /* 1341 * Determine the span of all loadable segments and calculate the 1342 * number of loadable segments. 1343 */ 1344 for (i = 0; i < nphdrs; i++) { 1345 p_type = STRUCT_FGET(mph, x.p_type); 1346 if (p_type == PT_LOAD || p_type == PT_SUNWBSS) { 1347 vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr); 1348 p_memsz = STRUCT_FGET(mph, x.p_memsz); 1349 1350 /* 1351 * Skip this header if it requests no memory to be 1352 * mapped. 1353 */ 1354 if (p_memsz == 0) { 1355 STRUCT_SET_HANDLE(mph, model, 1356 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1357 hsize)); 1358 MOBJ_STAT_ADD(nomem_header); 1359 continue; 1360 } 1361 if (num_segs++ == 0) { 1362 /* 1363 * The p_vaddr of the first PT_LOAD segment 1364 * must either be NULL or within the first 1365 * page in order to be interpreted. 1366 * Otherwise, its an invalid file. 1367 */ 1368 if (e_type == ET_DYN && 1369 ((caddr_t)((uintptr_t)vaddr & 1370 (uintptr_t)PAGEMASK) != NULL)) { 1371 MOBJ_STAT_ADD(inval_header); 1372 return (ENOTSUP); 1373 } 1374 start_addr = vaddr; 1375 /* 1376 * For the first segment, we need to map from 1377 * the beginning of the file, so we will 1378 * adjust the size of the mapping to include 1379 * this memory. 1380 */ 1381 p_offset = STRUCT_FGET(mph, x.p_offset); 1382 } else { 1383 p_offset = 0; 1384 } 1385 /* 1386 * Check to make sure that this mapping wouldn't 1387 * overlap a previous mapping. 1388 */ 1389 if (vaddr < p_end) { 1390 MOBJ_STAT_ADD(overlap_header); 1391 return (ENOTSUP); 1392 } 1393 1394 p_end = vaddr + p_memsz + p_offset; 1395 p_end = (caddr_t)P2ROUNDUP((size_t)p_end, PAGESIZE); 1396 1397 p_align = STRUCT_FGET(mph, x.p_align); 1398 if (p_align > 1 && p_align > max_align) { 1399 max_align = p_align; 1400 if (max_align < min_align) { 1401 max_align = min_align; 1402 MOBJ_STAT_ADD(min_align); 1403 } 1404 } 1405 } 1406 STRUCT_SET_HANDLE(mph, model, 1407 (struct myphdr *)((size_t)STRUCT_BUF(mph) + hsize)); 1408 } 1409 1410 /* 1411 * The alignment should be a power of 2, if it isn't we forgive it 1412 * and round up. On overflow, we'll set the alignment to max_align 1413 * rounded down to the nearest power of 2. 1414 */ 1415 if (max_align > 0 && !ISP2(max_align)) { 1416 MOBJ_STAT_ADD(np2_align); 1417 *align = 2 * (1L << (highbit(max_align) - 1)); 1418 if (*align < max_align || 1419 (*align > UINT_MAX && model == DATAMODEL_ILP32)) { 1420 MOBJ_STAT_ADD(np2_align_overflow); 1421 *align = 1L << (highbit(max_align) - 1); 1422 } 1423 } else { 1424 *align = max_align; 1425 } 1426 1427 ASSERT(*align >= PAGESIZE || *align == 0); 1428 1429 *loadable = num_segs; 1430 *len = p_end - start_addr; 1431 return (0); 1432 } 1433 1434 /* 1435 * Check the address space to see if the virtual addresses to be used are 1436 * available. If they are not, return errno for failure. On success, 0 1437 * will be returned, and the virtual addresses for each mmapobj_result_t 1438 * will be reserved. Note that a reservation could have earlier been made 1439 * for a given segment via a /dev/null mapping. 
If that is the case, then 1440 * we can use that VA space for our mappings. 1441 * Note: this function will only be used for ET_EXEC binaries. 1442 */ 1443 int 1444 check_exec_addrs(int loadable, mmapobj_result_t *mrp, caddr_t start_addr) 1445 { 1446 int i; 1447 struct as *as = curproc->p_as; 1448 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 1449 int ret; 1450 caddr_t myaddr; 1451 size_t mylen; 1452 struct seg *seg; 1453 1454 /* No need to reserve swap space now since it will be reserved later */ 1455 crargs.flags |= MAP_NORESERVE; 1456 as_rangelock(as); 1457 for (i = 0; i < loadable; i++) { 1458 1459 myaddr = start_addr + (size_t)mrp[i].mr_addr; 1460 mylen = mrp[i].mr_msize; 1461 1462 /* See if there is a hole in the as for this range */ 1463 if (as_gap(as, mylen, &myaddr, &mylen, 0, NULL) == 0) { 1464 ASSERT(myaddr == start_addr + (size_t)mrp[i].mr_addr); 1465 ASSERT(mylen == mrp[i].mr_msize); 1466 1467 #ifdef DEBUG 1468 if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) { 1469 MOBJ_STAT_ADD(exec_padding); 1470 } 1471 #endif 1472 ret = as_map(as, myaddr, mylen, segvn_create, &crargs); 1473 if (ret) { 1474 as_rangeunlock(as); 1475 mmapobj_unmap_exec(mrp, i, start_addr); 1476 return (ret); 1477 } 1478 } else { 1479 /* 1480 * There is a mapping that exists in the range 1481 * so check to see if it was a "reservation" 1482 * from /dev/null. The mapping is from 1483 * /dev/null if the mapping comes from 1484 * segdev and the type is neither MAP_SHARED 1485 * nor MAP_PRIVATE. 1486 */ 1487 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1488 seg = as_findseg(as, myaddr, 0); 1489 MOBJ_STAT_ADD(exec_addr_mapped); 1490 if (seg && seg->s_ops == &segdev_ops && 1491 ((SEGOP_GETTYPE(seg, myaddr) & 1492 (MAP_SHARED | MAP_PRIVATE)) == 0) && 1493 myaddr >= seg->s_base && 1494 myaddr + mylen <= 1495 seg->s_base + seg->s_size) { 1496 MOBJ_STAT_ADD(exec_addr_devnull); 1497 AS_LOCK_EXIT(as, &as->a_lock); 1498 (void) as_unmap(as, myaddr, mylen); 1499 ret = as_map(as, myaddr, mylen, segvn_create, 1500 &crargs); 1501 mrp[i].mr_flags |= MR_RESV; 1502 if (ret) { 1503 as_rangeunlock(as); 1504 /* Need to remap what we unmapped */ 1505 mmapobj_unmap_exec(mrp, i + 1, 1506 start_addr); 1507 return (ret); 1508 } 1509 } else { 1510 AS_LOCK_EXIT(as, &as->a_lock); 1511 as_rangeunlock(as); 1512 mmapobj_unmap_exec(mrp, i, start_addr); 1513 MOBJ_STAT_ADD(exec_addr_in_use); 1514 return (EADDRINUSE); 1515 } 1516 } 1517 } 1518 as_rangeunlock(as); 1519 return (0); 1520 } 1521 1522 /* 1523 * Walk through the ELF program headers and extract all useful information 1524 * for PT_LOAD and PT_SUNWBSS segments into mrp. 1525 * Return 0 on success or error on failure. 
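 * Roughly, the steps below are: validate the phdr entry size, look for
 * a cached lib_va entry for the file, otherwise size the mappings with
 * calc_loadable(), choose a starting address (mmapobj_lookup_start_addr()
 * or mmapobj_alloc_start_addr()), fill in the mrp array from the program
 * headers, and finally map everything with mmapobj_map_elf().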
1526 */ 1527 static int 1528 process_phdr(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, mmapobj_result_t *mrp, 1529 vnode_t *vp, uint_t *num_mapped, size_t padding, cred_t *fcred) 1530 { 1531 int i; 1532 caddr_t start_addr = NULL; 1533 caddr_t vaddr; 1534 size_t len = 0; 1535 size_t lib_len = 0; 1536 int ret; 1537 int prot; 1538 struct lib_va *lvp = NULL; 1539 vattr_t vattr; 1540 struct as *as = curproc->p_as; 1541 int error; 1542 int loadable = 0; 1543 int current = 0; 1544 int use_lib_va = 1; 1545 size_t align = 0; 1546 size_t add_pad = 0; 1547 int hdr_seen = 0; 1548 ushort_t e_type = ehdrp->e_type; /* same offset 32 and 64 bit */ 1549 uint_t p_type; 1550 offset_t p_offset; 1551 size_t p_memsz; 1552 size_t p_filesz; 1553 uint_t p_flags; 1554 int hsize; 1555 model_t model; 1556 STRUCT_HANDLE(myphdr, mph); 1557 1558 model = get_udatamodel(); 1559 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1560 1561 /* 1562 * Need to make sure that hsize is aligned properly. 1563 * For 32bit processes, 4 byte alignment is required. 1564 * For 64bit processes, 8 byte alignment is required. 1565 * If the alignment isn't correct, we need to return failure 1566 * since it could cause an alignment error panic while walking 1567 * the phdr array. 1568 */ 1569 if (model == DATAMODEL_LP64) { 1570 hsize = ehdrp->e_phentsize; 1571 if (hsize & 7) { 1572 MOBJ_STAT_ADD(phent_align64); 1573 return (ENOTSUP); 1574 } 1575 } else { 1576 ASSERT(model == DATAMODEL_ILP32); 1577 hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize; 1578 if (hsize & 3) { 1579 MOBJ_STAT_ADD(phent_align32); 1580 return (ENOTSUP); 1581 } 1582 } 1583 1584 if (padding != 0) { 1585 use_lib_va = 0; 1586 } 1587 if (e_type == ET_DYN) { 1588 vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME; 1589 error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL); 1590 if (error) { 1591 return (error); 1592 } 1593 /* Check to see if we already have a description for this lib */ 1594 lvp = lib_va_find(&vattr); 1595 1596 if (lvp != NULL) { 1597 MOBJ_STAT_ADD(lvp_found); 1598 if (use_lib_va) { 1599 start_addr = mmapobj_lookup_start_addr(lvp); 1600 if (start_addr == NULL) { 1601 lib_va_release(lvp); 1602 return (ENOMEM); 1603 } 1604 } 1605 1606 /* 1607 * loadable may be zero if the original allocator 1608 * of lvp hasn't finished setting it up but the rest 1609 * of the fields will be accurate. 1610 */ 1611 loadable = lvp->lv_num_segs; 1612 len = lvp->lv_len; 1613 align = lvp->lv_align; 1614 } 1615 } 1616 1617 /* 1618 * Determine the span of all loadable segments and calculate the 1619 * number of loadable segments, the total len spanned by the mappings 1620 * and the max alignment, if we didn't get them above. 1621 */ 1622 if (loadable == 0) { 1623 MOBJ_STAT_ADD(no_loadable_yet); 1624 ret = calc_loadable(ehdrp, phdrbase, nphdrs, &len, 1625 &loadable, &align); 1626 if (ret != 0) { 1627 /* 1628 * Since it'd be an invalid file, we shouldn't have 1629 * cached it previously. 1630 */ 1631 ASSERT(lvp == NULL); 1632 return (ret); 1633 } 1634 #ifdef DEBUG 1635 if (lvp) { 1636 ASSERT(len == lvp->lv_len); 1637 ASSERT(align == lvp->lv_align); 1638 } 1639 #endif 1640 } 1641 1642 /* Make sure there's something to map. */ 1643 if (len == 0 || loadable == 0) { 1644 /* 1645 * Since it'd be an invalid file, we shouldn't have 1646 * cached it previously. 
1647 */ 1648 ASSERT(lvp == NULL); 1649 MOBJ_STAT_ADD(nothing_to_map); 1650 return (ENOTSUP); 1651 } 1652 1653 lib_len = len; 1654 if (padding != 0) { 1655 loadable += 2; 1656 } 1657 if (loadable > *num_mapped) { 1658 *num_mapped = loadable; 1659 /* cleanup previous reservation */ 1660 if (start_addr) { 1661 (void) as_unmap(as, start_addr, lib_len); 1662 } 1663 MOBJ_STAT_ADD(e2big); 1664 if (lvp) { 1665 lib_va_release(lvp); 1666 } 1667 return (E2BIG); 1668 } 1669 1670 /* 1671 * We now know the size of the object to map and now we need to 1672 * get the start address to map it at. It's possible we already 1673 * have it if we found all the info we need in the lib_va cache. 1674 */ 1675 if (e_type == ET_DYN && start_addr == NULL) { 1676 /* 1677 * Need to make sure padding does not throw off 1678 * required alignment. We can only specify an 1679 * alignment for the starting address to be mapped, 1680 * so we round padding up to the alignment and map 1681 * from there and then throw out the extra later. 1682 */ 1683 if (padding != 0) { 1684 if (align > 1) { 1685 add_pad = P2ROUNDUP(padding, align); 1686 len += add_pad; 1687 MOBJ_STAT_ADD(dyn_pad_align); 1688 } else { 1689 MOBJ_STAT_ADD(dyn_pad_noalign); 1690 len += padding; /* at beginning */ 1691 } 1692 len += padding; /* at end of mapping */ 1693 } 1694 /* 1695 * At this point, if lvp is non-NULL, then above we 1696 * already found it in the cache but did not get 1697 * the start address since we were not going to use lib_va. 1698 * Since we know that lib_va will not be used, it's safe 1699 * to call mmapobj_alloc_start_addr and know that lvp 1700 * will not be modified. 1701 */ 1702 ASSERT(lvp ? use_lib_va == 0 : 1); 1703 start_addr = mmapobj_alloc_start_addr(&lvp, len, 1704 use_lib_va, align, &vattr); 1705 if (start_addr == NULL) { 1706 if (lvp) { 1707 lib_va_release(lvp); 1708 } 1709 MOBJ_STAT_ADD(alloc_start_fail); 1710 return (ENOMEM); 1711 } 1712 /* 1713 * If we can't cache it, no need to hang on to it. 1714 * Setting lv_num_segs to non-zero will make that 1715 * field active and since there are too many segments 1716 * to cache, all future users will not try to use lv_mps. 1717 */ 1718 if (lvp != NULL && loadable > LIBVA_CACHED_SEGS && use_lib_va) { 1719 lvp->lv_num_segs = loadable; 1720 lib_va_release(lvp); 1721 lvp = NULL; 1722 MOBJ_STAT_ADD(lvp_nocache); 1723 } 1724 /* 1725 * Free the beginning of the mapping if the padding 1726 * was not aligned correctly. 1727 */ 1728 if (padding != 0 && add_pad != padding) { 1729 (void) as_unmap(as, start_addr, 1730 add_pad - padding); 1731 start_addr += (add_pad - padding); 1732 MOBJ_STAT_ADD(extra_padding); 1733 } 1734 } 1735 1736 /* 1737 * At this point, we have reserved the virtual address space 1738 * for our mappings. Now we need to start filling out the mrp 1739 * array to describe all of the individual mappings we are going 1740 * to return. 1741 * For ET_EXEC there has been no memory reservation since we are 1742 * using fixed addresses. While filling in the mrp array below, 1743 * we will have the first segment biased to start at addr 0 1744 * and the rest will be biased by this same amount. Thus if there 1745 * is padding, the first padding will start at addr 0, and the next 1746 * segment will start at the value of padding. 
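 * For example, when padding is requested the finished array is laid out
 * roughly as: mrp[0] is MR_PADDING at relative address 0, the
 * PT_LOAD/PT_SUNWBSS mappings follow biased by padding, and a final
 * MR_PADDING entry of size padding follows the last loadable segment.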
1747 */ 1748 1749 /* We'll fill out padding later, so start filling in mrp at index 1 */ 1750 if (padding != 0) { 1751 current = 1; 1752 } 1753 1754 /* If we have no more need for lvp let it go now */ 1755 if (lvp != NULL && use_lib_va == 0) { 1756 lib_va_release(lvp); 1757 MOBJ_STAT_ADD(lvp_not_needed); 1758 lvp = NULL; 1759 } 1760 1761 /* Now fill out the mrp structs from the program headers */ 1762 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1763 for (i = 0; i < nphdrs; i++) { 1764 p_type = STRUCT_FGET(mph, x.p_type); 1765 if (p_type == PT_LOAD || p_type == PT_SUNWBSS) { 1766 vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr); 1767 p_memsz = STRUCT_FGET(mph, x.p_memsz); 1768 p_filesz = STRUCT_FGET(mph, x.p_filesz); 1769 p_offset = STRUCT_FGET(mph, x.p_offset); 1770 p_flags = STRUCT_FGET(mph, x.p_flags); 1771 1772 /* 1773 * Skip this header if it requests no memory to be 1774 * mapped. 1775 */ 1776 if (p_memsz == 0) { 1777 STRUCT_SET_HANDLE(mph, model, 1778 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1779 hsize)); 1780 MOBJ_STAT_ADD(no_mem_map_sz); 1781 continue; 1782 } 1783 1784 prot = 0; 1785 if (p_flags & PF_R) 1786 prot |= PROT_READ; 1787 if (p_flags & PF_W) 1788 prot |= PROT_WRITE; 1789 if (p_flags & PF_X) 1790 prot |= PROT_EXEC; 1791 1792 ASSERT(current < loadable); 1793 mrp[current].mr_msize = p_memsz; 1794 mrp[current].mr_fsize = p_filesz; 1795 mrp[current].mr_offset = p_offset; 1796 mrp[current].mr_prot = prot; 1797 1798 if (hdr_seen == 0 && p_filesz != 0) { 1799 mrp[current].mr_flags = MR_HDR_ELF; 1800 /* 1801 * We modify mr_offset because we 1802 * need to map the ELF header as well, and if 1803 * we didn't then the header could be left out 1804 * of the mapping that we will create later. 1805 * Since we're removing the offset, we need to 1806 * account for that in the other fields as well 1807 * since we will be mapping the memory from 0 1808 * to p_offset. 1809 */ 1810 if (e_type == ET_DYN) { 1811 mrp[current].mr_offset = 0; 1812 mrp[current].mr_msize += p_offset; 1813 mrp[current].mr_fsize += p_offset; 1814 } else { 1815 ASSERT(e_type == ET_EXEC); 1816 /* 1817 * Save off the start addr which will be 1818 * our bias for the rest of the 1819 * ET_EXEC mappings. 1820 */ 1821 start_addr = vaddr - padding; 1822 } 1823 mrp[current].mr_addr = (caddr_t)padding; 1824 hdr_seen = 1; 1825 } else { 1826 if (e_type == ET_EXEC) { 1827 /* bias mr_addr */ 1828 mrp[current].mr_addr = 1829 vaddr - (size_t)start_addr; 1830 } else { 1831 mrp[current].mr_addr = vaddr + padding; 1832 } 1833 mrp[current].mr_flags = 0; 1834 } 1835 current++; 1836 } 1837 1838 /* Move to next phdr */ 1839 STRUCT_SET_HANDLE(mph, model, 1840 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1841 hsize)); 1842 } 1843 1844 /* Now fill out the padding segments */ 1845 if (padding != 0) { 1846 mrp[0].mr_addr = NULL; 1847 mrp[0].mr_msize = padding; 1848 mrp[0].mr_fsize = 0; 1849 mrp[0].mr_offset = 0; 1850 mrp[0].mr_prot = 0; 1851 mrp[0].mr_flags = MR_PADDING; 1852 1853 /* Setup padding for the last segment */ 1854 ASSERT(current == loadable - 1); 1855 mrp[current].mr_addr = (caddr_t)lib_len + padding; 1856 mrp[current].mr_msize = padding; 1857 mrp[current].mr_fsize = 0; 1858 mrp[current].mr_offset = 0; 1859 mrp[current].mr_prot = 0; 1860 mrp[current].mr_flags = MR_PADDING; 1861 } 1862 1863 /* 1864 * Need to make sure address ranges desired are not in use or 1865 * are previously allocated reservations from /dev/null. For 1866 * ET_DYN, we already made sure our address range was free. 
1867 */ 1868 if (e_type == ET_EXEC) { 1869 ret = check_exec_addrs(loadable, mrp, start_addr); 1870 if (ret != 0) { 1871 ASSERT(lvp == NULL); 1872 MOBJ_STAT_ADD(check_exec_failed); 1873 return (ret); 1874 } 1875 } 1876 1877 /* Finish up our business with lvp. */ 1878 if (lvp) { 1879 ASSERT(e_type == ET_DYN); 1880 if (lvp->lv_num_segs == 0 && loadable <= LIBVA_CACHED_SEGS) { 1881 bcopy(mrp, lvp->lv_mps, 1882 loadable * sizeof (mmapobj_result_t)); 1883 membar_producer(); 1884 } 1885 /* 1886 * Setting lv_num_segs to a non-zero value indicates that 1887 * the cached information is now valid and can be used by other 1888 * threads. So, the above stores need to finish before lv_num_segs 1889 * is updated. lv_mps is only valid if lv_num_segs is non-zero and 1890 * no greater than LIBVA_CACHED_SEGS. 1891 */ 1892 lvp->lv_num_segs = loadable; 1893 lib_va_release(lvp); 1894 MOBJ_STAT_ADD(lvp_used); 1895 } 1896 1897 /* Now that we have mrp completely filled out, go map it */ 1898 ret = mmapobj_map_elf(vp, start_addr, mrp, loadable, fcred, e_type); 1899 if (ret == 0) { 1900 *num_mapped = loadable; 1901 } 1902 1903 return (ret); 1904 } 1905 1906 /* 1907 * Take the ELF file passed in, and do the work of mapping it. 1908 * num_mapped in - # elements in user buffer 1909 * num_mapped out - # sections mapped and length of mrp array if 1910 * no errors. 1911 */ 1912 static int 1913 doelfwork(Ehdr *ehdrp, vnode_t *vp, mmapobj_result_t *mrp, 1914 uint_t *num_mapped, size_t padding, cred_t *fcred) 1915 { 1916 int error; 1917 offset_t phoff; 1918 int nphdrs; 1919 unsigned char ei_class; 1920 unsigned short phentsize; 1921 ssize_t phsizep; 1922 caddr_t phbasep; 1923 int to_map; 1924 model_t model; 1925 1926 ei_class = ehdrp->e_ident[EI_CLASS]; 1927 model = get_udatamodel(); 1928 if ((model == DATAMODEL_ILP32 && ei_class == ELFCLASS64) || 1929 (model == DATAMODEL_LP64 && ei_class == ELFCLASS32)) { 1930 MOBJ_STAT_ADD(wrong_model); 1931 return (ENOTSUP); 1932 } 1933 1934 /* Can't execute code from "noexec" mounted filesystem. */ 1935 if (ehdrp->e_type == ET_EXEC && 1936 (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) { 1937 MOBJ_STAT_ADD(noexec_fs); 1938 return (EACCES); 1939 } 1940 1941 /* 1942 * Relocatable and core files are mapped as a single flat file 1943 * since no interpretation is done on them by mmapobj. 1944 */ 1945 if (ehdrp->e_type == ET_REL || ehdrp->e_type == ET_CORE) { 1946 to_map = padding ? 3 : 1; 1947 if (*num_mapped < to_map) { 1948 *num_mapped = to_map; 1949 MOBJ_STAT_ADD(e2big_et_rel); 1950 return (E2BIG); 1951 } 1952 error = mmapobj_map_flat(vp, mrp, padding, fcred); 1953 if (error == 0) { 1954 *num_mapped = to_map; 1955 mrp[padding ? 
1 : 0].mr_flags = MR_HDR_ELF; 1956 MOBJ_STAT_ADD(et_rel_mapped); 1957 } 1958 return (error); 1959 } 1960 1961 /* Check for an unknown ELF type */ 1962 if (ehdrp->e_type != ET_EXEC && ehdrp->e_type != ET_DYN) { 1963 MOBJ_STAT_ADD(unknown_elf_type); 1964 return (ENOTSUP); 1965 } 1966 1967 if (ei_class == ELFCLASS32) { 1968 Elf32_Ehdr *e32hdr = (Elf32_Ehdr *)ehdrp; 1969 ASSERT(model == DATAMODEL_ILP32); 1970 nphdrs = e32hdr->e_phnum; 1971 phentsize = e32hdr->e_phentsize; 1972 if (phentsize < sizeof (Elf32_Phdr)) { 1973 MOBJ_STAT_ADD(phent32_too_small); 1974 return (ENOTSUP); 1975 } 1976 phoff = e32hdr->e_phoff; 1977 } else if (ei_class == ELFCLASS64) { 1978 Elf64_Ehdr *e64hdr = (Elf64_Ehdr *)ehdrp; 1979 ASSERT(model == DATAMODEL_LP64); 1980 nphdrs = e64hdr->e_phnum; 1981 phentsize = e64hdr->e_phentsize; 1982 if (phentsize < sizeof (Elf64_Phdr)) { 1983 MOBJ_STAT_ADD(phent64_too_small); 1984 return (ENOTSUP); 1985 } 1986 phoff = e64hdr->e_phoff; 1987 } else { 1988 /* fallthrough case for an invalid ELF class */ 1989 MOBJ_STAT_ADD(inval_elf_class); 1990 return (ENOTSUP); 1991 } 1992 1993 /* 1994 * nphdrs should only have this value for core files which are handled 1995 * above as a single mapping. If other file types ever use this 1996 * sentinel, then we'll add the support needed to handle this here. 1997 */ 1998 if (nphdrs == PN_XNUM) { 1999 MOBJ_STAT_ADD(too_many_phdrs); 2000 return (ENOTSUP); 2001 } 2002 2003 phsizep = nphdrs * phentsize; 2004 2005 if (phsizep == 0) { 2006 MOBJ_STAT_ADD(no_phsize); 2007 return (ENOTSUP); 2008 } 2009 2010 /* Make sure we only wait for memory if it's a reasonable request */ 2011 if (phsizep > mmapobj_alloc_threshold) { 2012 MOBJ_STAT_ADD(phsize_large); 2013 if ((phbasep = kmem_alloc(phsizep, KM_NOSLEEP)) == NULL) { 2014 MOBJ_STAT_ADD(phsize_xtralarge); 2015 return (ENOMEM); 2016 } 2017 } else { 2018 phbasep = kmem_alloc(phsizep, KM_SLEEP); 2019 } 2020 2021 if ((error = vn_rdwr(UIO_READ, vp, phbasep, phsizep, 2022 (offset_t)phoff, UIO_SYSSPACE, 0, (rlim64_t)0, 2023 fcred, NULL)) != 0) { 2024 kmem_free(phbasep, phsizep); 2025 return (error); 2026 } 2027 2028 /* Now process the phdr's */ 2029 error = process_phdr(ehdrp, phbasep, nphdrs, mrp, vp, num_mapped, 2030 padding, fcred); 2031 kmem_free(phbasep, phsizep); 2032 return (error); 2033 } 2034 2035 #if defined(__sparc) 2036 /* 2037 * Hack to support 64 bit kernels running AOUT 4.x programs. 2038 * This is the sizeof (struct nlist) for a 32 bit kernel. 2039 * Since AOUT programs are 32 bit only, they will never use the 64 bit 2040 * sizeof (struct nlist) and thus creating a #define is the simplest 2041 * way around this since this is a format which is not being updated. 2042 * This will be used in the place of sizeof (struct nlist) below. 
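* (0xC is simply sizeof (struct nlist) for a 32 bit compilation; the structure contains pointer and long sized members, so a 64 bit kernel would compute a larger size.)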
2043 */ 2044 #define NLIST_SIZE (0xC) 2045 2046 static int 2047 doaoutwork(vnode_t *vp, mmapobj_result_t *mrp, 2048 uint_t *num_mapped, struct exec *hdr, cred_t *fcred) 2049 { 2050 int error; 2051 size_t size; 2052 size_t osize; 2053 size_t nsize; /* nlist size */ 2054 size_t msize; 2055 size_t zfoddiff; 2056 caddr_t addr; 2057 caddr_t start_addr; 2058 struct as *as = curproc->p_as; 2059 int prot = PROT_USER | PROT_READ | PROT_EXEC; 2060 uint_t mflag = MAP_PRIVATE | _MAP_LOW32; 2061 offset_t off = 0; 2062 int segnum = 0; 2063 uint_t to_map; 2064 int is_library = 0; 2065 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 2066 2067 /* Only 32bit apps supported by this file format */ 2068 if (get_udatamodel() != DATAMODEL_ILP32) { 2069 MOBJ_STAT_ADD(aout_64bit_try); 2070 return (ENOTSUP); 2071 } 2072 2073 /* Check to see if this is a library */ 2074 if (hdr->a_magic == ZMAGIC && hdr->a_entry < PAGESIZE) { 2075 is_library = 1; 2076 } 2077 2078 /* Can't execute code from "noexec" mounted filesystem. */ 2079 if (((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) && (is_library == 0)) { 2080 MOBJ_STAT_ADD(aout_noexec); 2081 return (EACCES); 2082 } 2083 2084 /* 2085 * There are 2 ways to calculate the mapped size of executable: 2086 * 1) rounded text size + data size + bss size. 2087 * 2) starting offset for text + text size + data size + text relocation 2088 * size + data relocation size + room for nlist data structure. 2089 * 2090 * The larger of the two sizes will be used to map this binary. 2091 */ 2092 osize = P2ROUNDUP(hdr->a_text, PAGESIZE) + hdr->a_data + hdr->a_bss; 2093 2094 off = hdr->a_magic == ZMAGIC ? 0 : sizeof (struct exec); 2095 2096 nsize = off + hdr->a_text + hdr->a_data + hdr->a_trsize + 2097 hdr->a_drsize + NLIST_SIZE; 2098 2099 size = MAX(osize, nsize); 2100 if (size != nsize) { 2101 nsize = 0; 2102 } 2103 2104 /* 2105 * 1 seg for text and 1 seg for initialized data. 2106 * 1 seg for bss (if can't fit in leftover space of init data) 2107 * 1 seg for nlist if needed. 2108 */ 2109 to_map = 2 + (nsize ? 1 : 0) + 2110 (hdr->a_bss > PAGESIZE - P2PHASE(hdr->a_data, PAGESIZE) ? 1 : 0); 2111 if (*num_mapped < to_map) { 2112 *num_mapped = to_map; 2113 MOBJ_STAT_ADD(aout_e2big); 2114 return (E2BIG); 2115 } 2116 2117 /* Reserve address space for the whole mapping */ 2118 if (is_library) { 2119 /* We'll let VOP_MAP below pick our address for us */ 2120 addr = NULL; 2121 MOBJ_STAT_ADD(aout_lib); 2122 } else { 2123 /* 2124 * default start address for fixed binaries from AOUT 4.x 2125 * standard. 2126 */ 2127 MOBJ_STAT_ADD(aout_fixed); 2128 mflag |= MAP_FIXED; 2129 addr = (caddr_t)0x2000; 2130 as_rangelock(as); 2131 if (as_gap(as, size, &addr, &size, 0, NULL) != 0) { 2132 as_rangeunlock(as); 2133 MOBJ_STAT_ADD(aout_addr_in_use); 2134 return (EADDRINUSE); 2135 } 2136 crargs.flags |= MAP_NORESERVE; 2137 error = as_map(as, addr, size, segvn_create, &crargs); 2138 ASSERT(addr == (caddr_t)0x2000); 2139 as_rangeunlock(as); 2140 } 2141 2142 start_addr = addr; 2143 osize = size; 2144 2145 /* 2146 * Map as large as we need, backed by file, this will be text, and 2147 * possibly the nlist segment. We map over this mapping for bss and 2148 * initialized data segments. 
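* The resulting layout is: file backed text at start_addr (mrp[0]); initialized data at start_addr + P2ROUNDUP(a_text, PAGESIZE) (mrp[1]); the remainder of the last data page zeroed; a separate bss segment only if a_bss does not fit in that zero filled tail (mrp[2]); and finally any extra room needed for the nlist data as the last entry.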
2149 */ 2150 error = VOP_MAP(vp, off, as, &addr, size, prot, PROT_ALL, 2151 mflag, fcred, NULL); 2152 if (error) { 2153 if (!is_library) { 2154 (void) as_unmap(as, start_addr, osize); 2155 } 2156 return (error); 2157 } 2158 2159 /* pickup the value of start_addr and osize for libraries */ 2160 start_addr = addr; 2161 osize = size; 2162 2163 /* 2164 * We have our initial reservation/allocation so we need to use fixed 2165 * addresses from now on. 2166 */ 2167 mflag |= MAP_FIXED; 2168 2169 mrp[0].mr_addr = addr; 2170 mrp[0].mr_msize = hdr->a_text; 2171 mrp[0].mr_fsize = hdr->a_text; 2172 mrp[0].mr_offset = 0; 2173 mrp[0].mr_prot = PROT_READ | PROT_EXEC; 2174 mrp[0].mr_flags = MR_HDR_AOUT; 2175 2176 2177 /* 2178 * Map initialized data. We are mapping over a portion of the 2179 * previous mapping which will be unmapped in VOP_MAP below. 2180 */ 2181 off = P2ROUNDUP((offset_t)(hdr->a_text), PAGESIZE); 2182 msize = off; 2183 addr += off; 2184 size = hdr->a_data; 2185 error = VOP_MAP(vp, off, as, &addr, size, PROT_ALL, PROT_ALL, 2186 mflag, fcred, NULL); 2187 if (error) { 2188 (void) as_unmap(as, start_addr, osize); 2189 return (error); 2190 } 2191 msize += size; 2192 mrp[1].mr_addr = addr; 2193 mrp[1].mr_msize = size; 2194 mrp[1].mr_fsize = size; 2195 mrp[1].mr_offset = 0; 2196 mrp[1].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC; 2197 mrp[1].mr_flags = 0; 2198 2199 /* Need to zero out remainder of page */ 2200 addr += hdr->a_data; 2201 zfoddiff = P2PHASE((size_t)addr, PAGESIZE); 2202 if (zfoddiff) { 2203 label_t ljb; 2204 2205 MOBJ_STAT_ADD(aout_zfoddiff); 2206 zfoddiff = PAGESIZE - zfoddiff; 2207 if (on_fault(&ljb)) { 2208 no_fault(); 2209 MOBJ_STAT_ADD(aout_uzero_fault); 2210 (void) as_unmap(as, start_addr, osize); 2211 return (EFAULT); 2212 } 2213 uzero(addr, zfoddiff); 2214 no_fault(); 2215 } 2216 msize += zfoddiff; 2217 segnum = 2; 2218 2219 /* Map bss */ 2220 if (hdr->a_bss > zfoddiff) { 2221 struct segvn_crargs crargs = 2222 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 2223 MOBJ_STAT_ADD(aout_map_bss); 2224 addr += zfoddiff; 2225 size = hdr->a_bss - zfoddiff; 2226 as_rangelock(as); 2227 (void) as_unmap(as, addr, size); 2228 error = as_map(as, addr, size, segvn_create, &crargs); 2229 as_rangeunlock(as); 2230 msize += size; 2231 2232 if (error) { 2233 MOBJ_STAT_ADD(aout_bss_fail); 2234 (void) as_unmap(as, start_addr, osize); 2235 return (error); 2236 } 2237 mrp[2].mr_addr = addr; 2238 mrp[2].mr_msize = size; 2239 mrp[2].mr_fsize = 0; 2240 mrp[2].mr_offset = 0; 2241 mrp[2].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC; 2242 mrp[2].mr_flags = 0; 2243 2244 addr += size; 2245 segnum = 3; 2246 } 2247 2248 /* 2249 * If we have extra bits left over, we need to include that in how 2250 * much we mapped to make sure the nlist logic is correct 2251 */ 2252 msize = P2ROUNDUP(msize, PAGESIZE); 2253 2254 if (nsize && msize < nsize) { 2255 MOBJ_STAT_ADD(aout_nlist); 2256 mrp[segnum].mr_addr = addr; 2257 mrp[segnum].mr_msize = nsize - msize; 2258 mrp[segnum].mr_fsize = 0; 2259 mrp[segnum].mr_offset = 0; 2260 mrp[segnum].mr_prot = PROT_READ | PROT_EXEC; 2261 mrp[segnum].mr_flags = 0; 2262 } 2263 2264 *num_mapped = to_map; 2265 return (0); 2266 } 2267 #endif 2268 2269 /* 2270 * These are the two types of files that we can interpret and we want to read 2271 * in enough info to cover both types when looking at the initial header. 2272 */ 2273 #define MAX_HEADER_SIZE (MAX(sizeof (Ehdr), sizeof (struct exec))) 2274 2275 /* 2276 * Map vp passed in in an interpreted manner. 
ELF and AOUT files will be 2277 * interpreted and mapped appropriately for execution. 2278 * num_mapped in - # elements in mrp 2279 * num_mapped out - # sections mapped and length of mrp array if 2280 * no errors or E2BIG returned. 2281 * 2282 * Returns 0 on success, errno value on failure. 2283 */ 2284 static int 2285 mmapobj_map_interpret(vnode_t *vp, mmapobj_result_t *mrp, 2286 uint_t *num_mapped, size_t padding, cred_t *fcred) 2287 { 2288 int error = 0; 2289 vattr_t vattr; 2290 struct lib_va *lvp; 2291 caddr_t start_addr; 2292 model_t model; 2293 2294 /* 2295 * header has to be aligned to the native size of ulong_t in order 2296 * to avoid an unaligned access when dereferencing the header as 2297 * a ulong_t. Thus we allocate our array on the stack of type 2298 * ulong_t and then have header, which we dereference later as a char 2299 * array point at lheader. 2300 */ 2301 ulong_t lheader[(MAX_HEADER_SIZE / (sizeof (ulong_t))) + 1]; 2302 caddr_t header = (caddr_t)&lheader; 2303 2304 vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME | AT_SIZE; 2305 error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL); 2306 if (error) { 2307 return (error); 2308 } 2309 2310 /* 2311 * Check lib_va to see if we already have a full description 2312 * for this library. This is the fast path and only used for 2313 * ET_DYN ELF files (dynamic libraries). 2314 */ 2315 if (padding == 0 && (lvp = lib_va_find(&vattr)) != NULL) { 2316 int num_segs; 2317 2318 model = get_udatamodel(); 2319 if ((model == DATAMODEL_ILP32 && 2320 lvp->lv_flags & LV_ELF64) || 2321 (model == DATAMODEL_LP64 && 2322 lvp->lv_flags & LV_ELF32)) { 2323 lib_va_release(lvp); 2324 MOBJ_STAT_ADD(fast_wrong_model); 2325 return (ENOTSUP); 2326 } 2327 num_segs = lvp->lv_num_segs; 2328 if (*num_mapped < num_segs) { 2329 *num_mapped = num_segs; 2330 lib_va_release(lvp); 2331 MOBJ_STAT_ADD(fast_e2big); 2332 return (E2BIG); 2333 } 2334 2335 /* 2336 * Check to see if we have all the mappable program headers 2337 * cached. 2338 */ 2339 if (num_segs <= LIBVA_CACHED_SEGS && num_segs != 0) { 2340 MOBJ_STAT_ADD(fast); 2341 start_addr = mmapobj_lookup_start_addr(lvp); 2342 if (start_addr == NULL) { 2343 lib_va_release(lvp); 2344 return (ENOMEM); 2345 } 2346 2347 bcopy(lvp->lv_mps, mrp, 2348 num_segs * sizeof (mmapobj_result_t)); 2349 2350 error = mmapobj_map_elf(vp, start_addr, mrp, 2351 num_segs, fcred, ET_DYN); 2352 2353 lib_va_release(lvp); 2354 if (error == 0) { 2355 *num_mapped = num_segs; 2356 MOBJ_STAT_ADD(fast_success); 2357 } 2358 return (error); 2359 } 2360 MOBJ_STAT_ADD(fast_not_now); 2361 2362 /* Release it for now since we'll look it up below */ 2363 lib_va_release(lvp); 2364 } 2365 2366 /* 2367 * Time to see if this is a file we can interpret. If it's smaller 2368 * than this, then we can't interpret it. 
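* A file smaller than MAX_HEADER_SIZE cannot contain a complete ELF or exec header, so it is rejected with ENOTSUP.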
2369 */ 2370 if (vattr.va_size < MAX_HEADER_SIZE) { 2371 MOBJ_STAT_ADD(small_file); 2372 return (ENOTSUP); 2373 } 2374 2375 if ((error = vn_rdwr(UIO_READ, vp, header, MAX_HEADER_SIZE, 0, 2376 UIO_SYSSPACE, 0, (rlim64_t)0, fcred, NULL)) != 0) { 2377 MOBJ_STAT_ADD(read_error); 2378 return (error); 2379 } 2380 2381 /* Verify file type */ 2382 if (header[EI_MAG0] == ELFMAG0 && header[EI_MAG1] == ELFMAG1 && 2383 header[EI_MAG2] == ELFMAG2 && header[EI_MAG3] == ELFMAG3) { 2384 return (doelfwork((Ehdr *)lheader, vp, mrp, num_mapped, 2385 padding, fcred)); 2386 } 2387 2388 #if defined(__sparc) 2389 /* On sparc, check for 4.X AOUT format */ 2390 switch (((struct exec *)header)->a_magic) { 2391 case OMAGIC: 2392 case ZMAGIC: 2393 case NMAGIC: 2394 return (doaoutwork(vp, mrp, num_mapped, 2395 (struct exec *)lheader, fcred)); 2396 } 2397 #endif 2398 2399 /* Unsupported type */ 2400 MOBJ_STAT_ADD(unsupported); 2401 return (ENOTSUP); 2402 } 2403 2404 /* 2405 * Given a vnode, map it as either a flat file or interpret it and map 2406 * it according to the rules of the file type. 2407 * *num_mapped will contain the size of the mmapobj_result_t array passed in. 2408 * If padding is non-zero, the mappings will be padded by that amount 2409 * rounded up to the nearest pagesize. 2410 * If the mapping is successful, *num_mapped will contain the number of 2411 * distinct mappings created, and mrp will point to the array of 2412 * mmapobj_result_t's which describe these mappings. 2413 * 2414 * On error, an errno value is returned. 2415 * A special case is E2BIG, which is returned when there are more than 2416 * *num_mapped mappings to be created; *num_mapped will then be set to the 2417 * number of mappings needed. 2418 */ 2419 int 2420 mmapobj(vnode_t *vp, uint_t flags, mmapobj_result_t *mrp, 2421 uint_t *num_mapped, size_t padding, cred_t *fcred) 2422 { 2423 int to_map; 2424 int error = 0; 2425 2426 ASSERT((padding & PAGEOFFSET) == 0); 2427 ASSERT((flags & ~MMOBJ_ALL_FLAGS) == 0); 2428 ASSERT(num_mapped != NULL); 2429 ASSERT((flags & MMOBJ_PADDING) ? padding != 0 : padding == 0); 2430 2431 if ((flags & MMOBJ_INTERPRET) == 0) { 2432 to_map = padding ? 3 : 1; 2433 if (*num_mapped < to_map) { 2434 *num_mapped = to_map; 2435 MOBJ_STAT_ADD(flat_e2big); 2436 return (E2BIG); 2437 } 2438 error = mmapobj_map_flat(vp, mrp, padding, fcred); 2439 2440 if (error) { 2441 return (error); 2442 } 2443 *num_mapped = to_map; 2444 return (0); 2445 } 2446 2447 error = mmapobj_map_interpret(vp, mrp, num_mapped, padding, fcred); 2448 return (error); 2449 }
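/*
 * Illustrative sketch, not part of the original source: one way an in-kernel
 * caller could drive the E2BIG protocol described above, retrying once with
 * a result array sized from the count handed back in *num_mapped.  The guard
 * macro, the function name and the initial guess of 16 entries are
 * assumptions made purely for this example; the sketch relies on the headers
 * already included at the top of this file.
 */
#ifdef MMAPOBJ_EXAMPLE
static int
mmapobj_example_caller(vnode_t *vp, size_t padding, cred_t *cr)
{
	mmapobj_result_t first_try[16];
	mmapobj_result_t *mrp = first_try;
	uint_t nmap = 16;
	uint_t flags = MMOBJ_INTERPRET;
	size_t allocsz = 0;
	int error;

	/* Padding must be page aligned and advertised via MMOBJ_PADDING. */
	ASSERT((padding & PAGEOFFSET) == 0);
	if (padding != 0)
		flags |= MMOBJ_PADDING;

	error = mmapobj(vp, flags, mrp, &nmap, padding, cr);
	if (error == E2BIG) {
		/* nmap now holds the number of mappings actually needed. */
		allocsz = nmap * sizeof (mmapobj_result_t);
		mrp = kmem_alloc(allocsz, KM_SLEEP);
		error = mmapobj(vp, flags, mrp, &nmap, padding, cr);
	}

	/*
	 * On success, mrp[0 .. nmap - 1] describe the mappings created in
	 * the current process' address space; the array itself is only
	 * bookkeeping and may be freed once its contents are recorded.
	 */
	if (allocsz != 0)
		kmem_free(mrp, allocsz);
	return (error);
}
#endif	/* MMAPOBJ_EXAMPLE */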