1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2015 Joyent, Inc. All rights reserved. 26 */ 27 28 #include <assert.h> 29 #include <errno.h> 30 #include <stdlib.h> 31 #include <signal.h> 32 #include <unistd.h> 33 #include <ucontext.h> 34 #include <thread.h> 35 #include <strings.h> 36 #include <libintl.h> 37 #include <sys/regset.h> 38 #include <sys/syscall.h> 39 #include <sys/inttypes.h> 40 #include <sys/param.h> 41 #include <sys/types.h> 42 #include <sys/segments.h> 43 #include <signal.h> 44 #include <sys/lx_misc.h> 45 #include <sys/lx_types.h> 46 #include <sys/lx_signal.h> 47 #include <sys/lx_syscall.h> 48 #include <sys/lx_brand.h> 49 #include <sys/lx_debug.h> 50 #include <sys/lx_thread.h> 51 #include <sys/fork.h> 52 #include <sys/mman.h> 53 #include <lx_syscall.h> 54 55 56 #define SHARED_AS \ 57 (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND \ 58 | LX_CLONE_THREAD) 59 #define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK) 60 #define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH) 61 62 #define IS_FORK(f) (((f) & SHARED_AS) == 0) 63 #define IS_VFORK(f) (((f) & CLONE_VFORK) == CLONE_VFORK) 64 65 /* 66 * This is dicey. This seems to be an internal glibc structure, and not 67 * part of any external interface. Thus, it is subject to change without 68 * notice. FWIW, clone(2) itself seems to be an internal (or at least 69 * unstable) interface, since strace(1) shows it differently than the man 70 * page. 71 */ 72 struct lx_desc 73 { 74 uint32_t entry_number; 75 uint32_t base_addr; 76 uint32_t limit; 77 uint32_t seg_32bit:1; 78 uint32_t contents:2; 79 uint32_t read_exec_only:1; 80 uint32_t limit_in_pages:1; 81 uint32_t seg_not_present:1; 82 uint32_t useable:1; 83 uint32_t empty:25; 84 }; 85 86 struct clone_state { 87 void *c_retaddr; /* instr after clone()'s int80 */ 88 int c_flags; /* flags to clone(2) */ 89 int c_sig; /* signal to send on thread exit */ 90 void *c_stk; /* %esp of new thread */ 91 void *c_ptidp; 92 struct lx_desc *c_ldtinfo; /* thread-specific segment */ 93 void *c_ctidp; 94 ucontext_t c_uc; /* original register state/sigmask */ 95 lx_affmask_t c_affmask; /* CPU affinity mask */ 96 volatile int *c_clone_res; /* pid/error returned to cloner */ 97 int c_ptrace_event; /* ptrace(2) event for child stop */ 98 void *c_ntv_stk; /* native stack for this thread */ 99 size_t c_ntv_stk_sz; /* native stack size */ 100 lx_tsd_t *c_lx_tsd; /* tsd area for thread */ 101 }; 102 103 /* 104 * Counter incremented when we vfork(2) ourselves, and decremented when the 105 * vfork(2)ed child exit(2)s or exec(2)s. 106 */ 107 static int is_vforked = 0; 108 109 long 110 lx_exit(uintptr_t p1) 111 { 112 int status = (int)p1; 113 lx_tsd_t *lx_tsd; 114 115 /* 116 * If we are a vfork(2)ed child, we need to exit as quickly and 117 * cleanly as possible to avoid corrupting our parent. 118 */ 119 if (is_vforked != 0) { 120 is_vforked--; 121 _exit(status); 122 } 123 124 lx_tsd = lx_get_tsd(); 125 126 lx_tsd->lxtsd_exit = LX_ET_EXIT; 127 lx_tsd->lxtsd_exit_status = status; 128 129 lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE, 130 (ulong_t)status, NULL); 131 132 /* 133 * This thread is exiting. Restore the state of the thread to 134 * what it was before we started running linux code. 135 */ 136 (void) setcontext(&lx_tsd->lxtsd_exit_context); 137 138 /* 139 * If we returned from the setcontext(2), something is very wrong. 140 */ 141 lx_err_fatal("exit: unable to set exit context: %s", strerror(errno)); 142 143 /*NOTREACHED*/ 144 return (0); 145 } 146 147 long 148 lx_group_exit(uintptr_t p1) 149 { 150 int status = (int)p1; 151 lx_tsd_t *lx_tsd; 152 153 /* 154 * If we are a vfork(2)ed child, we need to exit as quickly and 155 * cleanly as possible to avoid corrupting our parent. 156 */ 157 if (is_vforked != 0) { 158 is_vforked--; 159 _exit(status); 160 } 161 162 lx_tsd = lx_get_tsd(); 163 164 lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP; 165 lx_tsd->lxtsd_exit_status = status; 166 167 /* 168 * This thread is exiting. Restore the state of the thread to 169 * what it was before we started running linux code. 170 */ 171 (void) setcontext(&lx_tsd->lxtsd_exit_context); 172 173 /* 174 * If we returned from the setcontext(2), something is very wrong. 175 */ 176 lx_err_fatal("group_exit: unable to set exit context: %s", 177 strerror(errno)); 178 179 /*NOTREACHED*/ 180 return (0); 181 } 182 183 static void * 184 clone_start(void *arg) 185 { 186 int rval; 187 struct clone_state *cs = (struct clone_state *)arg; 188 lx_tsd_t *lxtsd; 189 190 /* 191 * Let the kernel finish setting up all the needed state for this 192 * new thread. 193 * 194 * We already created the thread using the thr_create(3C) library 195 * call, so most of the work required to emulate lx_clone(2) has 196 * been done by the time we get to this point. 197 */ 198 lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()"); 199 lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)", 200 cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp); 201 202 rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp, 203 cs->c_ldtinfo, cs->c_ctidp); 204 205 /* 206 * At this point the parent is waiting for cs->c_clone_res to go 207 * non-zero to indicate the thread has been cloned. The value set 208 * in cs->c_clone_res will be used for the return value from 209 * clone(). 210 */ 211 if (rval < 0) { 212 *(cs->c_clone_res) = -errno; 213 lx_debug("\tkernel clone failed, errno %d\n", errno); 214 free(cs->c_lx_tsd); 215 free(cs); 216 return (NULL); 217 } 218 219 if (lx_sched_setaffinity(0, sizeof (cs->c_affmask), 220 (uintptr_t)&cs->c_affmask) != 0) { 221 *(cs->c_clone_res) = -errno; 222 223 lx_err_fatal("Unable to set affinity mask in child thread: %s", 224 strerror(errno)); 225 } 226 227 /* 228 * Initialize the thread specific data for this thread. 229 */ 230 lxtsd = cs->c_lx_tsd; 231 lx_init_tsd(lxtsd); 232 lxtsd->lxtsd_clone_state = cs; 233 234 /* 235 * Install the emulation stack for this thread. Register the 236 * thread-specific data structure with the stack list so that it may be 237 * freed at thread exit or fork(2). 238 */ 239 lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd); 240 241 /* 242 * Let the parent know that the clone has (effectively) been 243 * completed. 244 */ 245 *(cs->c_clone_res) = rval; 246 247 /* 248 * We want to load the general registers from this context, restore the 249 * original signal mask, and switch to the BRAND stack. The original 250 * signal mask was saved to the context by lx_clone(). 251 */ 252 cs->c_uc.uc_flags = UC_CPU | UC_SIGMASK; 253 cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND; 254 255 /* 256 * New threads will not link into the existing context chain. 257 */ 258 cs->c_uc.uc_link = NULL; 259 260 /* 261 * Set stack pointer and entry point for new thread: 262 */ 263 LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk; 264 LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr; 265 266 /* 267 * Return 0 to the child: 268 */ 269 LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0; 270 271 /* 272 * Fire the ptrace(2) event stop in the new thread: 273 */ 274 lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc); 275 276 /* 277 * Jump to the Linux process. This call cannot return. 278 */ 279 lx_jump_to_linux(&cs->c_uc); 280 } 281 282 /* 283 * The way Linux handles stopping for FORK vs. CLONE does not map exactly to 284 * which syscall was used. Instead, it has to do with which signal is set in 285 * the low byte of the clone flag. The only time the CLONE event is emitted is 286 * if the clone signal (the low byte of the flags argument) is set to something 287 * other than SIGCHLD (see the Linux src in kernel/fork.c do_fork() for the 288 * actual code). 289 */ 290 static int 291 ptrace_clone_event(int flags) 292 { 293 if (flags & LX_CLONE_VFORK) 294 return (LX_PTRACE_O_TRACEVFORK); 295 296 if ((flags & LX_CSIGNAL) != LX_SIGCHLD) 297 return (LX_PTRACE_O_TRACECLONE); 298 299 return (LX_PTRACE_O_TRACEFORK); 300 } 301 302 /* 303 * See glibc sysdeps/unix/sysv/linux/x86_64/clone.S code for x64 argument order 304 * and the Linux kernel/fork.c code for the various ways arguments can be passed 305 * to the clone syscall (CONFIG_CLONE_BACKWARDS, et al). 306 */ 307 long 308 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, 309 uintptr_t p5) 310 { 311 struct clone_state *cs; 312 int flags = (int)p1; 313 void *cldstk = (void *)p2; 314 void *ptidp = (void *)p3; 315 #if defined(_LP64) 316 void *ctidp = (void *)p4; 317 struct lx_desc *ldtinfo = (void *)p5; 318 #else /* is 32bit */ 319 struct lx_desc *ldtinfo = (void *)p4; 320 void *ctidp = (void *)p5; 321 #endif 322 thread_t tid; 323 volatile int clone_res; 324 int sig; 325 int rval; 326 int pid; 327 ucontext_t *ucp; 328 sigset_t sigmask, osigmask; 329 int fork_flags = 0; 330 int ptrace_event; 331 int error = 0; 332 333 if (flags & LX_CLONE_SETTLS) { 334 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p " 335 "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp); 336 } else { 337 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)", 338 flags, cldstk, ptidp); 339 } 340 341 /* 342 * Only supported for pid 0 on Linux after version 2.3.21, and 343 * apparently not at all since 2.5.16. 344 */ 345 if (flags & LX_CLONE_PID) 346 return (-EINVAL); 347 348 /* 349 * CLONE_THREAD requires CLONE_SIGHAND. 350 * 351 * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared 352 * in kernel 2.4 and prior. 353 * In kernel 2.6 (and later) CLONE_DETACHED was dropped completely, so 354 * we no longer have this requirement. 355 */ 356 357 if (flags & CLONE_TD) { 358 if (!(flags & LX_CLONE_SIGHAND)) 359 return (-EINVAL); 360 if (strncmp(lx_release, "2.4", 3) == 0 && 361 (flags & CLONE_TD) != CLONE_TD) 362 return (-EINVAL); 363 } 364 365 ucp = lx_syscall_regs(); 366 367 /* test if pointer passed by user are writable */ 368 if (flags & LX_CLONE_PARENT_SETTID) { 369 if (uucopy(ptidp, &pid, sizeof (int)) != 0) 370 return (-EFAULT); 371 if (uucopy(&pid, ptidp, sizeof (int)) != 0) 372 return (-EFAULT); 373 } 374 if (flags & LX_CLONE_CHILD_SETTID) { 375 if (uucopy(ctidp, &pid, sizeof (int)) != 0) 376 return (-EFAULT); 377 if (uucopy(&pid, ctidp, sizeof (int)) != 0) 378 return (-EFAULT); 379 } 380 381 ptrace_event = ptrace_clone_event(flags); 382 383 /* 384 * Inform the in-kernel ptrace(2) subsystem that we are about to 385 * emulate a fork(2), vfork(2) or clone(2) system call. 386 */ 387 lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE)); 388 389 /* 390 * Handle a fork(2) operation here. If this is not a fork, a new 391 * thread will be created after this block. 392 */ 393 if (IS_FORK(flags) || IS_VFORK(flags)) { 394 if (flags & LX_CLONE_PARENT) { 395 lx_unsupported("clone(2) only supports CLONE_PARENT " 396 "for threads.\n"); 397 return (-ENOTSUP); 398 } 399 400 if ((flags & LX_CSIGNAL) == 0) 401 fork_flags |= FORK_NOSIGCHLD; 402 403 /* 404 * Suspend signal delivery, run the stack management prefork 405 * handler and perform the actual fork(2) operation. 406 */ 407 _sigoff(); 408 lx_stack_prefork(); 409 if (flags & LX_CLONE_VFORK) { 410 lx_sighandlers_t saved; 411 412 /* 413 * Because we keep our signal disposition at user-land 414 * (and in memory), we must prevent it from being 415 * clobbered should our vforked child change the 416 * disposition (e.g., via sigaction()) before releasing 417 * the address space. We preserve our disposition by 418 * taking a snapshot of it before the vfork and 419 * restoring it afterwards -- which we can get away 420 * with because we know that we aren't executing 421 * concurrently with our child. 422 */ 423 lx_sighandlers_save(&saved); 424 is_vforked++; 425 rval = vforkx(fork_flags); 426 if (rval != 0) { 427 is_vforked--; 428 lx_sighandlers_restore(&saved); 429 } 430 } else { 431 rval = forkx(fork_flags); 432 } 433 434 /* 435 * The parent process returns through the regular system call 436 * path here. 437 */ 438 if (rval != 0) { 439 if (!IS_VFORK(flags) || rval < 0) { 440 /* 441 * Run the stack management postfork handler in 442 * the parent. If this was a vfork(2), we only 443 * run it in the parent if the fork operation 444 * failed; the vfork(2) child has already run 445 * it for our address space. 446 */ 447 lx_stack_postfork(); 448 } 449 450 /* 451 * Since we've already forked, we can't do much if 452 * uucopy fails, so we just ignore failure. Failure is 453 * unlikely since we've tested the memory before we did 454 * the fork. 455 */ 456 if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) { 457 (void) uucopy(&rval, ptidp, sizeof (int)); 458 } 459 460 if (rval > 0) { 461 lx_ptrace_stop_if_option(ptrace_event, B_FALSE, 462 (ulong_t)rval, NULL); 463 } 464 465 /* 466 * Re-enable signal delivery in the parent process. 467 */ 468 _sigon(); 469 470 return ((rval < 0) ? -errno : rval); 471 } 472 473 /* 474 * The rest of this block runs only within the new child 475 * process. 476 */ 477 478 /* 479 * Run the stack management postfork handler in the child. 480 */ 481 lx_stack_postfork(); 482 483 if (!IS_VFORK(flags)) { 484 /* 485 * We must free the stacks and thread-specific data 486 * objects for every thread except the one duplicated 487 * from the parent by forkx(). 488 */ 489 lx_free_other_stacks(); 490 } 491 492 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) { 493 /* 494 * lx_getpid should not fail, and if it does, there's 495 * not much we can do about it since we've already 496 * forked, so on failure, we just don't copy the 497 * memory. 498 */ 499 pid = syscall(SYS_brand, B_GETPID); 500 if (pid >= 0) 501 (void) uucopy(&pid, ctidp, sizeof (int)); 502 } 503 504 /* 505 * Set up additional data in the lx_proc_data structure as 506 * necessary. 507 */ 508 if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp, 509 ldtinfo, ctidp)) < 0) { 510 return (rval); 511 } 512 513 if (IS_VFORK(flags)) { 514 ucontext_t vforkuc; 515 516 /* 517 * The vfork(2) interface is somewhat less than ideal. 518 * The unfortunate notion of borrowing the address 519 * space of the parent process requires us to jump 520 * through several hoops to prevent corrupting parent 521 * emulation state. 522 * 523 * When returning in the child, we make a copy of the 524 * system call return context and discard three pages 525 * of the native stack. Returning normally would 526 * clobber the native stack frame in which the brand 527 * library in the parent process is presently waiting. 528 * 529 * The calling program is expected to correctly use 530 * this dusty, underspecified relic. Neglecting to 531 * immediately call execve(2) or exit(2) is not 532 * cricket; this stack space will be permanently lost, 533 * not to mention myriad other undefined behaviour. 534 */ 535 bcopy(ucp, &vforkuc, sizeof (vforkuc)); 536 vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP; 537 vforkuc.uc_link = NULL; 538 539 lx_debug("\tvfork native stack sp %p", 540 vforkuc.uc_brand_data[1]); 541 542 /* 543 * If provided, the child needs its new stack set up. 544 */ 545 if (cldstk != 0) { 546 lx_debug("\tvfork cldstk %p", cldstk); 547 LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk; 548 } 549 550 /* 551 * Stop for ptrace if required. 552 */ 553 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL); 554 555 /* 556 * Return to the child via the specially constructed 557 * vfork(2) context. 558 */ 559 LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0); 560 (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc, 561 LX_SYS_clone, 0, 0); 562 563 assert(0); 564 } 565 566 /* 567 * If provided, the child needs its new stack set up. 568 */ 569 if (cldstk != 0) { 570 lx_debug("\tcldstk %p", cldstk); 571 LX_REG(ucp, REG_SP) = (uintptr_t)cldstk; 572 } 573 574 /* 575 * Stop for ptrace if required. 576 */ 577 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL); 578 579 /* 580 * Re-enable signal delivery in the child process. 581 */ 582 _sigon(); 583 584 /* 585 * The child process returns via the regular emulated system 586 * call path: 587 */ 588 return (0); 589 } 590 591 /* 592 * We have very restricted support.... only exactly these flags are 593 * supported 594 */ 595 if (((flags & SHARED_AS) != SHARED_AS)) { 596 lx_unsupported("clone(2) requires that all or none of " 597 "CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n", 598 flags); 599 return (-ENOTSUP); 600 } 601 602 if (cldstk == NULL) { 603 lx_unsupported("clone(2) requires the caller to allocate the " 604 "child's stack.\n"); 605 return (-ENOTSUP); 606 } 607 608 /* 609 * If we want a signal-on-exit, ensure that the signal is valid. 610 */ 611 if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) { 612 lx_unsupported("clone(2) passed unsupported signal: %d", sig); 613 return (-ENOTSUP); 614 } 615 616 /* 617 * Initialise the state structure we pass as an argument to the new 618 * thread: 619 */ 620 if ((cs = malloc(sizeof (*cs))) == NULL) { 621 lx_debug("could not allocate clone_state: %s", strerror(errno)); 622 return (-ENOMEM); 623 } 624 cs->c_flags = flags; 625 cs->c_sig = sig; 626 cs->c_stk = cldstk; 627 cs->c_ptidp = ptidp; 628 cs->c_ldtinfo = ldtinfo; 629 cs->c_ctidp = ctidp; 630 cs->c_clone_res = &clone_res; 631 cs->c_ptrace_event = ptrace_event; 632 /* 633 * We want the new thread to return directly to the call site for 634 * the system call. 635 */ 636 cs->c_retaddr = (void *)LX_REG(ucp, REG_PC); 637 /* 638 * Copy the saved context for the clone(2) system call so that the 639 * new thread may use it to initialise registers. 640 */ 641 bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc)); 642 if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) { 643 free(cs); 644 return (-ENOMEM); 645 } 646 647 if (lx_sched_getaffinity(0, sizeof (cs->c_affmask), 648 (uintptr_t)&cs->c_affmask) == -1) { 649 lx_err_fatal("Unable to get affinity mask for parent " 650 "thread: %s", strerror(errno)); 651 } 652 653 clone_res = 0; 654 655 /* 656 * Block all signals because the thread we create won't be able to 657 * properly handle them until it's fully set up. 658 */ 659 (void) sigfillset(&sigmask); 660 if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) { 661 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno)); 662 free(cs->c_lx_tsd); 663 free(cs); 664 return (-errno); 665 } 666 cs->c_uc.uc_sigmask = osigmask; 667 668 /* 669 * Allocate the native stack for this new thread now, so that we 670 * can return failure gracefully as ENOMEM. 671 */ 672 if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) { 673 free(cs->c_lx_tsd); 674 free(cs); 675 return (-ENOMEM); 676 } 677 678 rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid); 679 680 /* 681 * If the thread did not start, free the resources we allocated: 682 */ 683 if (rval == -1) { 684 error = errno; 685 (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz); 686 free(cs->c_lx_tsd); 687 free(cs); 688 } 689 690 /* 691 * Release any pending signals 692 */ 693 (void) sigprocmask(SIG_SETMASK, &osigmask, NULL); 694 695 /* 696 * Wait for the child to be created and have its tid assigned. 697 */ 698 if (rval == 0) { 699 while (clone_res == 0) 700 ; 701 702 rval = clone_res; 703 lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval, 704 NULL); 705 706 return (rval); 707 } else { 708 /* 709 * Return the error from thr_create(3C). 710 */ 711 return (-error); 712 } 713 }