1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/kmem.h> 29 #include <sys/errno.h> 30 #include <sys/thread.h> 31 #include <sys/systm.h> 32 #include <sys/syscall.h> 33 #include <sys/proc.h> 34 #include <sys/modctl.h> 35 #include <sys/cmn_err.h> 36 #include <sys/model.h> 37 #include <sys/exec.h> 38 #include <sys/lx_impl.h> 39 #include <sys/machbrand.h> 40 #include <sys/lx_syscalls.h> 41 #include <sys/lx_pid.h> 42 #include <sys/lx_futex.h> 43 #include <sys/lx_brand.h> 44 #include <sys/termios.h> 45 #include <sys/sunddi.h> 46 #include <sys/ddi.h> 47 #include <sys/vnode.h> 48 #include <sys/pathname.h> 49 #include <sys/auxv.h> 50 #include <sys/priv.h> 51 #include <sys/regset.h> 52 #include <sys/privregs.h> 53 #include <sys/archsystm.h> 54 #include <sys/zone.h> 55 #include <sys/brand.h> 56 57 int lx_debug = 0; 58 59 void lx_init_brand_data(zone_t *); 60 void lx_free_brand_data(zone_t *); 61 void lx_setbrand(proc_t *); 62 int lx_getattr(zone_t *, int, void *, size_t *); 63 int lx_setattr(zone_t *, int, void *, size_t); 64 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 65 uintptr_t, uintptr_t, uintptr_t); 66 int lx_get_kern_version(void); 67 void lx_set_kern_version(zone_t *, int); 68 void lx_copy_procdata(proc_t *, proc_t *); 69 70 extern void lx_setrval(klwp_t *, int, int); 71 extern void lx_proc_exit(proc_t *, klwp_t *); 72 extern void lx_exec(); 73 extern int lx_initlwp(klwp_t *); 74 extern void lx_forklwp(klwp_t *, klwp_t *); 75 extern void lx_exitlwp(klwp_t *); 76 extern void lx_freelwp(klwp_t *); 77 extern greg_t lx_fixsegreg(greg_t, model_t); 78 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); 79 80 int lx_systrace_brand_enabled; 81 82 lx_systrace_f *lx_systrace_entry_ptr; 83 lx_systrace_f *lx_systrace_return_ptr; 84 85 static int lx_systrace_enabled; 86 87 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 88 struct intpdata *idata, int level, long *execsz, int setid, 89 caddr_t exec_file, struct cred *cred, int brand_action); 90 91 /* lx brand */ 92 struct brand_ops lx_brops = { 93 lx_init_brand_data, 94 lx_free_brand_data, 95 lx_brandsys, 96 lx_setbrand, 97 lx_getattr, 98 lx_setattr, 99 lx_copy_procdata, 100 lx_proc_exit, 101 lx_exec, 102 lx_setrval, 103 lx_initlwp, 104 lx_forklwp, 105 lx_freelwp, 106 lx_exitlwp, 107 lx_elfexec, 108 NULL, 109 NULL, 110 NSIG, 111 }; 112 113 struct brand_mach_ops lx_mops = { 114 NULL, 115 lx_brand_int80_callback, 116 NULL, 117 NULL, 118 NULL, 119 lx_fixsegreg, 120 }; 121 122 struct brand lx_brand = { 123 BRAND_VER_1, 124 "lx", 125 &lx_brops, 126 &lx_mops 127 }; 128 129 static struct modlbrand modlbrand = { 130 &mod_brandops, "lx brand", &lx_brand 131 }; 132 133 static struct modlinkage modlinkage = { 134 MODREV_1, (void *)&modlbrand, NULL 135 }; 136 137 void 138 lx_proc_exit(proc_t *p, klwp_t *lwp) 139 { 140 zone_t *z = p->p_zone; 141 142 ASSERT(p->p_brand != NULL); 143 ASSERT(p->p_brand_data != NULL); 144 145 /* 146 * If init is dying and we aren't explicitly shutting down the zone 147 * or the system, then Solaris is about to restart init. The Linux 148 * init is not designed to handle a restart, which it interprets as 149 * a reboot. To give it a sane environment in which to run, we 150 * reboot the zone. 151 */ 152 if (p->p_pid == z->zone_proc_initpid) { 153 if (z->zone_boot_err == 0 && 154 z->zone_restart_init && 155 zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && 156 zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) 157 (void) zone_kadmin(A_REBOOT, 0, NULL, CRED()); 158 } 159 lx_exitlwp(lwp); 160 kmem_free(p->p_brand_data, sizeof (struct lx_proc_data)); 161 p->p_brand_data = NULL; 162 } 163 164 void 165 lx_setbrand(proc_t *p) 166 { 167 kthread_t *t = p->p_tlist; 168 int err; 169 170 ASSERT(p->p_brand_data == NULL); 171 ASSERT(ttolxlwp(curthread) == NULL); 172 173 p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP); 174 175 /* 176 * This routine can only be called for single-threaded processes. 177 * Since lx_initlwp() can only fail if we run out of PIDs for 178 * multithreaded processes, we know that this can never fail. 179 */ 180 err = lx_initlwp(t->t_lwp); 181 ASSERT(err == 0); 182 } 183 184 /* ARGSUSED */ 185 int 186 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 187 { 188 boolean_t val; 189 int num; 190 191 if (attr == LX_ATTR_RESTART_INIT) { 192 if (bufsize > sizeof (boolean_t)) 193 return (ERANGE); 194 if (copyin(buf, &val, sizeof (val)) != 0) 195 return (EFAULT); 196 if (val != B_TRUE && val != B_FALSE) 197 return (EINVAL); 198 zone->zone_restart_init = val; 199 return (0); 200 } else if (attr == LX_KERN_VERSION_NUM) { 201 if (bufsize > sizeof (int)) 202 return (ERANGE); 203 if (copyin(buf, &num, sizeof (num)) != 0) 204 return (EFAULT); 205 lx_set_kern_version(zone, num); 206 return (0); 207 } 208 return (EINVAL); 209 } 210 211 /* ARGSUSED */ 212 int 213 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 214 { 215 int num; 216 if (attr == LX_ATTR_RESTART_INIT) { 217 if (*bufsize < sizeof (boolean_t)) 218 return (ERANGE); 219 if (copyout(&zone->zone_restart_init, buf, 220 sizeof (boolean_t)) != 0) 221 return (EFAULT); 222 *bufsize = sizeof (boolean_t); 223 return (0); 224 } else if (attr == LX_KERN_VERSION_NUM) { 225 if (*bufsize < sizeof (int)) 226 return (ERANGE); 227 num = lx_get_kern_version(); 228 if (copyout(&num, buf, sizeof (int)) != 0) 229 return (EFAULT); 230 *bufsize = sizeof (int); 231 return (0); 232 } 233 return (-EINVAL); 234 } 235 236 /* 237 * Enable ptrace system call tracing for the given LWP. This is done by 238 * both setting the flag in that LWP's brand data (in the kernel) and setting 239 * the process-wide trace flag (in the brand library of the traced process). 240 */ 241 static int 242 lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set) 243 { 244 proc_t *p; 245 kthread_t *t; 246 klwp_t *lwp; 247 lx_proc_data_t *lpdp; 248 lx_lwp_data_t *lldp; 249 uintptr_t addr; 250 int ret, flag = 1; 251 252 if ((p = sprlock(pid)) == NULL) 253 return (ESRCH); 254 255 if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) { 256 sprunlock(p); 257 return (EPERM); 258 } 259 260 if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) { 261 sprunlock(p); 262 return (ESRCH); 263 } 264 265 if ((lpdp = p->p_brand_data) == NULL || 266 (lldp = lwp->lwp_brand) == NULL) { 267 sprunlock(p); 268 return (ESRCH); 269 } 270 271 if (set) { 272 /* 273 * Enable the ptrace flag for this LWP and this process. Note 274 * that we will turn off the LWP's ptrace flag, but we don't 275 * turn off the process's ptrace flag. 276 */ 277 lldp->br_ptrace = 1; 278 lpdp->l_ptrace = 1; 279 280 addr = lpdp->l_traceflag; 281 282 mutex_exit(&p->p_lock); 283 284 /* 285 * This can fail only in some rare corner cases where the 286 * process is exiting or we're completely out of memory. In 287 * these cases, it's sufficient to return an error to the ptrace 288 * consumer and leave the process-wide flag set. 289 */ 290 ret = uwrite(p, &flag, sizeof (flag), addr); 291 292 mutex_enter(&p->p_lock); 293 294 /* 295 * If we couldn't set the trace flag, unset the LWP's ptrace 296 * flag as there ptrace consumer won't expect this LWP to stop. 297 */ 298 if (ret != 0) 299 lldp->br_ptrace = 0; 300 } else { 301 lldp->br_ptrace = 0; 302 ret = 0; 303 } 304 305 sprunlock(p); 306 307 if (ret != 0) 308 ret = EIO; 309 310 return (ret); 311 } 312 313 static void 314 lx_ptrace_fire(void) 315 { 316 kthread_t *t = curthread; 317 klwp_t *lwp = ttolwp(t); 318 lx_lwp_data_t *lldp = lwp->lwp_brand; 319 320 /* 321 * The ptrace flag only applies until the next event is encountered 322 * for the given LWP. If it's set, turn off the flag and poke the 323 * controlling process by raising a signal. 324 */ 325 if (lldp->br_ptrace) { 326 lldp->br_ptrace = 0; 327 tsignal(t, SIGTRAP); 328 } 329 } 330 331 void 332 lx_brand_systrace_enable(void) 333 { 334 extern void lx_brand_int80_enable(void); 335 336 ASSERT(!lx_systrace_enabled); 337 338 lx_brand_int80_enable(); 339 340 lx_systrace_enabled = 1; 341 } 342 343 void 344 lx_brand_systrace_disable(void) 345 { 346 extern void lx_brand_int80_disable(void); 347 348 ASSERT(lx_systrace_enabled); 349 350 lx_brand_int80_disable(); 351 352 lx_systrace_enabled = 0; 353 } 354 355 void 356 lx_init_brand_data(zone_t *zone) 357 { 358 lx_zone_data_t *data; 359 ASSERT(zone->zone_brand == &lx_brand); 360 ASSERT(zone->zone_brand_data == NULL); 361 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); 362 /* 363 * Set the default lxzd_kernel_version to LX_KERN_2_4. 364 * This can be changed by a call to setattr() during zone boot. 365 */ 366 data->lxzd_kernel_version = LX_KERN_2_4; 367 data->lxzd_max_syscall = LX_NSYSCALLS_2_4; 368 zone->zone_brand_data = data; 369 } 370 371 void 372 lx_free_brand_data(zone_t *zone) 373 { 374 kmem_free(zone->zone_brand_data, sizeof (lx_zone_data_t)); 375 } 376 377 /* 378 * Get the addresses of the user-space system call handler and attach it to 379 * the proc structure. Returning 0 indicates success; the value returned 380 * by the system call is the value stored in rval. Returning a non-zero 381 * value indicates a failure; the value returned is used to set errno, -1 382 * is returned from the syscall and the contents of rval are ignored. To 383 * set errno and have the syscall return a value other than -1 we can 384 * manually set errno and rval and return 0. 385 */ 386 int 387 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 388 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) 389 { 390 kthread_t *t = curthread; 391 proc_t *p = ttoproc(t); 392 lx_proc_data_t *pd; 393 int linux_call; 394 struct termios *termios; 395 uint_t termios_len; 396 int error; 397 lx_brand_registration_t reg; 398 399 /* 400 * There is one operation that is suppored for non-branded 401 * process. B_EXEC_BRAND. This is the equilivant of an 402 * exec call, but the new process that is created will be 403 * a branded process. 404 */ 405 if (cmd == B_EXEC_BRAND) { 406 ASSERT(p->p_zone != NULL); 407 ASSERT(p->p_zone->zone_brand == &lx_brand); 408 return (exec_common( 409 (char *)arg1, (const char **)arg2, (const char **)arg3, 410 EBA_BRAND)); 411 } 412 413 /* For all other operations this must be a branded process. */ 414 if (p->p_brand == NULL) 415 return (set_errno(ENOSYS)); 416 417 ASSERT(p->p_brand == &lx_brand); 418 ASSERT(p->p_brand_data != NULL); 419 420 switch (cmd) { 421 case B_REGISTER: 422 if (p->p_model == DATAMODEL_NATIVE) { 423 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { 424 lx_print("Failed to copyin brand registration " 425 "at 0x%p\n", (void *)arg1); 426 return (EFAULT); 427 } 428 #ifdef _LP64 429 } else { 430 lx_brand_registration32_t reg32; 431 432 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { 433 lx_print("Failed to copyin brand registration " 434 "at 0x%p\n", (void *)arg1); 435 return (EFAULT); 436 } 437 438 reg.lxbr_version = (uint_t)reg32.lxbr_version; 439 reg.lxbr_handler = 440 (void *)(uintptr_t)reg32.lxbr_handler; 441 reg.lxbr_tracehandler = 442 (void *)(uintptr_t)reg32.lxbr_tracehandler; 443 reg.lxbr_traceflag = 444 (void *)(uintptr_t)reg32.lxbr_traceflag; 445 #endif 446 } 447 448 if (reg.lxbr_version != LX_VERSION_1) { 449 lx_print("Invalid brand library version (%u)\n", 450 reg.lxbr_version); 451 return (EINVAL); 452 } 453 454 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", 455 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); 456 pd = p->p_brand_data; 457 pd->l_handler = (uintptr_t)reg.lxbr_handler; 458 pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler; 459 pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag; 460 *rval = 0; 461 return (0); 462 case B_TTYMODES: 463 /* This is necessary for emulating TCGETS ioctls. */ 464 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), 465 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, 466 &termios_len) != DDI_SUCCESS) 467 return (EIO); 468 469 ASSERT(termios_len == sizeof (*termios)); 470 471 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { 472 ddi_prop_free(termios); 473 return (EFAULT); 474 } 475 476 ddi_prop_free(termios); 477 *rval = 0; 478 return (0); 479 480 case B_ELFDATA: 481 pd = curproc->p_brand_data; 482 if (copyout(&pd->l_elf_data, (void *)arg1, 483 sizeof (lx_elf_data_t)) != 0) { 484 (void) set_errno(EFAULT); 485 return (*rval = -1); 486 } 487 *rval = 0; 488 return (0); 489 490 case B_EXEC_NATIVE: 491 error = exec_common( 492 (char *)arg1, (const char **)arg2, (const char **)arg3, 493 EBA_NATIVE); 494 if (error) { 495 (void) set_errno(error); 496 return (*rval = -1); 497 } 498 return (*rval = 0); 499 500 case B_LPID_TO_SPAIR: 501 /* 502 * Given a Linux pid as arg1, return the Solaris pid in arg2 and 503 * the Solaris LWP in arg3. We also translate pid 1 (which is 504 * hardcoded in many applications) to the zone's init process. 505 */ 506 { 507 pid_t s_pid; 508 id_t s_tid; 509 510 if ((pid_t)arg1 == 1) { 511 s_pid = p->p_zone->zone_proc_initpid; 512 /* handle the dead/missing init(1M) case */ 513 if (s_pid == -1) 514 s_pid = 1; 515 s_tid = 1; 516 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, 517 &s_tid) < 0) 518 return (ESRCH); 519 520 if (copyout(&s_pid, (void *)arg2, 521 sizeof (s_pid)) != 0 || 522 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) 523 return (EFAULT); 524 525 *rval = 0; 526 return (0); 527 } 528 529 case B_PTRACE_SYSCALL: 530 *rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2, 531 (int)arg3); 532 return (0); 533 534 case B_SYSENTRY: 535 if (lx_systrace_enabled) { 536 uint32_t args[6]; 537 538 ASSERT(lx_systrace_entry_ptr != NULL); 539 540 if (copyin((void *)arg2, args, sizeof (args)) != 0) 541 return (EFAULT); 542 543 (*lx_systrace_entry_ptr)(arg1, args[0], args[1], 544 args[2], args[3], args[4], args[5]); 545 } 546 547 lx_ptrace_fire(); 548 549 pd = p->p_brand_data; 550 551 /* 552 * If neither DTrace not ptrace are interested in tracing 553 * this process any more, turn off the trace flag. 554 */ 555 if (!lx_systrace_enabled && !pd->l_ptrace) 556 (void) suword32((void *)pd->l_traceflag, 0); 557 558 *rval = 0; 559 return (0); 560 561 case B_SYSRETURN: 562 if (lx_systrace_enabled) { 563 ASSERT(lx_systrace_return_ptr != NULL); 564 565 (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0); 566 } 567 568 lx_ptrace_fire(); 569 570 pd = p->p_brand_data; 571 572 /* 573 * If neither DTrace not ptrace are interested in tracing 574 * this process any more, turn off the trace flag. 575 */ 576 if (!lx_systrace_enabled && !pd->l_ptrace) 577 (void) suword32((void *)pd->l_traceflag, 0); 578 579 *rval = 0; 580 return (0); 581 582 case B_SET_AFFINITY_MASK: 583 case B_GET_AFFINITY_MASK: 584 /* 585 * Retrieve or store the CPU affinity mask for the 586 * requested linux pid. 587 * 588 * arg1 is a linux PID (0 means curthread). 589 * arg2 is the size of the given mask. 590 * arg3 is the address of the affinity mask. 591 */ 592 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); 593 594 default: 595 linux_call = cmd - B_EMULATE_SYSCALL; 596 /* 597 * Only checking against highest syscall number for all kernel 598 * versions, since check for specific kernel version is done 599 * in userland prior to this call, and duplicating logic would 600 * be redundant. 601 */ 602 if (linux_call >= 0 && linux_call < LX_NSYSCALLS) { 603 *rval = lx_emulate_syscall(linux_call, arg1, arg2, 604 arg3, arg4, arg5, arg6); 605 return (0); 606 } 607 } 608 609 return (EINVAL); 610 } 611 612 int 613 lx_get_zone_kern_version(zone_t *zone) 614 { 615 return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version); 616 } 617 618 int 619 lx_get_kern_version() 620 { 621 return (lx_get_zone_kern_version(curzone)); 622 } 623 624 void 625 lx_set_kern_version(zone_t *zone, int vers) 626 { 627 lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; 628 629 lxzd->lxzd_kernel_version = vers; 630 if (vers == LX_KERN_2_6) 631 lxzd->lxzd_max_syscall = LX_NSYSCALLS_2_6; 632 } 633 634 /* 635 * Copy the per-process brand data from a parent proc to a child. 636 */ 637 void 638 lx_copy_procdata(proc_t *child, proc_t *parent) 639 { 640 lx_proc_data_t *cpd, *ppd; 641 642 ppd = parent->p_brand_data; 643 644 ASSERT(ppd != NULL); 645 646 cpd = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP); 647 *cpd = *ppd; 648 649 child->p_brand_data = cpd; 650 } 651 652 /* 653 * Currently, only 32-bit branded ELF executables are supported. 654 */ 655 #if defined(_LP64) 656 #define elfexec elf32exec 657 #define mapexec_brand mapexec32_brand 658 #endif /* _LP64 */ 659 660 /* 661 * Exec routine called by elfexec() to load 32-bit Linux binaries. 662 */ 663 static int 664 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 665 struct intpdata *idata, int level, long *execsz, int setid, 666 caddr_t exec_file, struct cred *cred, int brand_action) 667 { 668 int error; 669 vnode_t *nvp; 670 auxv32_t phdr_auxv32[3] = { 671 { AT_SUN_BRAND_LX_PHDR, 0 }, 672 { AT_SUN_BRAND_AUX2, 0 }, 673 { AT_SUN_BRAND_AUX3, 0 } 674 }; 675 Elf32_Ehdr ehdr; 676 Elf32_Addr uphdr_vaddr; 677 intptr_t voffset; 678 int interp; 679 int i; 680 struct execenv env; 681 struct user *up = PTOU(ttoproc(curthread)); 682 lx_elf_data_t *edp = 683 &((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data; 684 685 ASSERT(ttoproc(curthread)->p_brand == &lx_brand); 686 ASSERT(ttoproc(curthread)->p_brand_data != NULL); 687 688 /* 689 * Set the brandname and library name for the new process so that 690 * elfexec() puts them onto the stack. 691 */ 692 args->brandname = LX_BRANDNAME; 693 args->emulator = LX_LIB_PATH; 694 695 /* 696 * We will exec the brand library, and map in the linux linker and the 697 * linux executable. 698 */ 699 if ((error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP, 700 &nvp))) { 701 uprintf("%s: not found.", LX_LIB); 702 return (error); 703 } 704 705 if ((error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid, 706 exec_file, cred, brand_action))) { 707 VN_RELE(nvp); 708 return (error); 709 } 710 VN_RELE(nvp); 711 712 bzero(&env, sizeof (env)); 713 714 if ((error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, 715 exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase, 716 &env.ex_brksize, NULL))) 717 return (error); 718 719 /* 720 * Save off the important properties of the lx executable. The brand 721 * library will ask us for this data later, when it is ready to set 722 * things up for the lx executable. 723 */ 724 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : 725 voffset + uphdr_vaddr; 726 edp->ed_entry = voffset + ehdr.e_entry; 727 edp->ed_phent = ehdr.e_phentsize; 728 edp->ed_phnum = ehdr.e_phnum; 729 730 if (interp) { 731 if (ehdr.e_type == ET_DYN) { 732 /* 733 * This is a shared object executable, so we need to 734 * pick a reasonable place to put the heap. Just don't 735 * use the first page. 736 */ 737 env.ex_brkbase = (caddr_t)PAGESIZE; 738 env.ex_bssbase = (caddr_t)PAGESIZE; 739 } 740 741 /* 742 * If the program needs an interpreter (most do), map it in and 743 * store relevant information about it in the aux vector, where 744 * the brand library can find it. 745 */ 746 if ((error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP, 747 &nvp))) { 748 uprintf("%s: not found.", LX_LINKER); 749 return (error); 750 } 751 if ((error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, 752 &voffset, exec_file, &interp, NULL, NULL, NULL, NULL))) { 753 VN_RELE(nvp); 754 return (error); 755 } 756 VN_RELE(nvp); 757 758 /* 759 * Now that we know the base address of the brand's linker, 760 * place it in the aux vector. 761 */ 762 edp->ed_base = voffset; 763 edp->ed_ldentry = voffset + ehdr.e_entry; 764 } else { 765 /* 766 * This program has no interpreter. The lx brand library will 767 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, 768 * so in this case, put the entry point of the main executable 769 * there. 770 */ 771 if (ehdr.e_type == ET_EXEC) { 772 /* 773 * An executable with no interpreter, this must be a 774 * statically linked executable, which means we loaded 775 * it at the address specified in the elf header, in 776 * which case the e_entry field of the elf header is an 777 * absolute address. 778 */ 779 edp->ed_ldentry = ehdr.e_entry; 780 edp->ed_entry = ehdr.e_entry; 781 } else { 782 /* 783 * A shared object with no interpreter, we use the 784 * calculated address from above. 785 */ 786 edp->ed_ldentry = edp->ed_entry; 787 788 /* 789 * In all situations except an ET_DYN elf object with no 790 * interpreter, we want to leave the brk and base 791 * values set by mapexec_brand alone. Normally when 792 * running ET_DYN objects on Solaris (most likely 793 * /lib/ld.so.1) the kernel sets brk and base to 0 since 794 * it doesn't know where to put the heap, and later the 795 * linker will call brk() to initialize the heap in: 796 * usr/src/cmd/sgs/rtld/common/setup.c:setup() 797 * after it has determined where to put it. (This 798 * decision is made after the linker loads and inspects 799 * elf properties of the target executable being run.) 800 * 801 * So for ET_DYN Linux executables, we also don't know 802 * where the heap should go, so we'll set the brk and 803 * base to 0. But in this case the Solaris linker will 804 * not initialize the heap, so when the Linux linker 805 * starts running there is no heap allocated. This 806 * seems to be ok on Linux 2.4 based systems because the 807 * Linux linker/libc fall back to using mmap() to 808 * allocate memory. But on 2.6 systems, running 809 * applications by specifying them as command line 810 * arguments to the linker results in segfaults for an 811 * as yet undetermined reason (which seems to indicatej 812 * that a more permanent fix for heap initalization in 813 * these cases may be necessary). 814 */ 815 if (ehdr.e_type == ET_DYN) { 816 env.ex_bssbase = (caddr_t)0; 817 env.ex_brkbase = (caddr_t)0; 818 env.ex_brksize = 0; 819 } 820 } 821 822 } 823 824 env.ex_vp = vp; 825 setexecenv(&env); 826 827 /* 828 * We don't need to copy this stuff out. It is only used by our 829 * tools to locate the lx linker's debug section. But we should at 830 * least try to keep /proc's view of the aux vector consistent with 831 * what's on the process stack. 832 */ 833 phdr_auxv32[0].a_un.a_val = edp->ed_phdr; 834 835 /* 836 * Linux 2.6 programs such as ps will print an error message if the 837 * following aux entry is missing 838 */ 839 if (lx_get_kern_version() >= LX_KERN_2_6) { 840 phdr_auxv32[1].a_type = AT_CLKTCK; 841 phdr_auxv32[1].a_un.a_val = hz; 842 } 843 844 if (copyout(&phdr_auxv32, args->auxp_brand, 845 sizeof (phdr_auxv32)) == -1) 846 return (EFAULT); 847 848 /* 849 * /proc uses the AT_ENTRY aux vector entry to deduce 850 * the location of the executable in the address space. The user 851 * structure contains a copy of the aux vector that needs to have those 852 * entries patched with the values of the real lx executable (they 853 * currently contain the values from the lx brand library that was 854 * elfexec'd, above). 855 * 856 * For live processes, AT_BASE is used to locate the linker segment, 857 * which /proc and friends will later use to find Solaris symbols 858 * (such as rtld_db_preinit). However, for core files, /proc uses 859 * AT_ENTRY to find the right segment to label as the executable. 860 * So we set AT_ENTRY to be the entry point of the linux executable, 861 * but leave AT_BASE to be the address of the Solaris linker. 862 */ 863 for (i = 0; i < __KERN_NAUXV_IMPL; i++) { 864 if (up->u_auxv[i].a_type == AT_ENTRY) 865 up->u_auxv[i].a_un.a_val = edp->ed_entry; 866 if (up->u_auxv[i].a_type == AT_SUN_BRAND_LX_PHDR) 867 up->u_auxv[i].a_un.a_val = edp->ed_phdr; 868 } 869 870 return (0); 871 } 872 873 int 874 _init(void) 875 { 876 int err = 0; 877 878 /* pid/tid conversion hash tables */ 879 lx_pid_init(); 880 881 /* for lx_futex() */ 882 lx_futex_init(); 883 884 err = mod_install(&modlinkage); 885 if (err != 0) { 886 cmn_err(CE_WARN, "Couldn't install lx brand module"); 887 888 /* 889 * This looks drastic, but it should never happen. These 890 * two data structures should be completely free-able until 891 * they are used by Linux processes. Since the brand 892 * wasn't loaded there should be no Linux processes, and 893 * thus no way for these data structures to be modified. 894 */ 895 lx_pid_fini(); 896 if (lx_futex_fini()) 897 panic("lx brand module cannot be loaded or unloaded."); 898 } 899 return (err); 900 } 901 902 int 903 _info(struct modinfo *modinfop) 904 { 905 return (mod_info(&modlinkage, modinfop)); 906 } 907 908 int 909 _fini(void) 910 { 911 int err; 912 int futex_done = 0; 913 914 /* 915 * If there are any zones using this brand, we can't allow it to be 916 * unloaded. 917 */ 918 if (brand_zone_count(&lx_brand)) 919 return (EBUSY); 920 921 lx_pid_fini(); 922 923 if ((err = lx_futex_fini()) != 0) 924 goto done; 925 futex_done = 1; 926 927 err = mod_remove(&modlinkage); 928 929 done: 930 if (err) { 931 /* 932 * If we can't unload the module, then we have to get it 933 * back into a sane state. 934 */ 935 lx_pid_init(); 936 937 if (futex_done) 938 lx_futex_init(); 939 940 } 941 942 return (err); 943 }