1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2015 Joyent, Inc. All rights reserved.
26 */
27
28 #include <assert.h>
29 #include <errno.h>
30 #include <stdlib.h>
31 #include <signal.h>
32 #include <unistd.h>
33 #include <ucontext.h>
34 #include <thread.h>
35 #include <strings.h>
36 #include <libintl.h>
37 #include <sys/regset.h>
38 #include <sys/syscall.h>
39 #include <sys/inttypes.h>
40 #include <sys/param.h>
41 #include <sys/types.h>
42 #include <sys/segments.h>
43 #include <signal.h>
44 #include <sys/lx_misc.h>
45 #include <sys/lx_types.h>
46 #include <sys/lx_signal.h>
47 #include <sys/lx_syscall.h>
48 #include <sys/lx_brand.h>
49 #include <sys/lx_debug.h>
50 #include <sys/lx_thread.h>
51 #include <sys/fork.h>
52 #include <sys/mman.h>
53 #include <lx_syscall.h>
54
55
/*
 * Flag groupings used to classify a clone(2) request.
 *
 * SHARED_AS	every flag that must be set for a thread-style clone
 * CLONE_VFORK	the flag pair that identifies a vfork(2)-style clone
 * CLONE_TD	the thread/detach flag pair
 */
#define	SHARED_AS	(LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | \
			LX_CLONE_SIGHAND | LX_CLONE_THREAD)
#define	CLONE_VFORK	(LX_CLONE_VFORK | LX_CLONE_VM)
#define	CLONE_TD	(LX_CLONE_DETACH | LX_CLONE_THREAD)

/* Non-zero when none of the SHARED_AS flags are present in (f). */
#define	IS_FORK(f)	(!((f) & SHARED_AS))
/* Non-zero when both CLONE_VFORK flags are present in (f). */
#define	IS_VFORK(f)	(((f) & CLONE_VFORK) == CLONE_VFORK)
64
/*
 * Descriptor for a thread-specific segment, as handed to clone(2).
 *
 * Treat this layout with suspicion: it looks like a structure private to
 * glibc rather than part of a published interface, so it may change
 * without notice.  For what it is worth, clone(2) itself behaves like an
 * internal (or at least unstable) interface, because strace(1) displays
 * it differently from the manual page.
 *
 * NOTE(review): the field list resembles the Linux "struct user_desc";
 * confirm against the Linux headers before relying on that.
 */
struct lx_desc {
	uint32_t	entry_number;
	uint32_t	base_addr;
	uint32_t	limit;
	/* The remaining members are bit-fields packed into one 32-bit word. */
	uint32_t	seg_32bit : 1;
	uint32_t	contents : 2;
	uint32_t	read_exec_only : 1;
	uint32_t	limit_in_pages : 1;
	uint32_t	seg_not_present : 1;
	uint32_t	useable : 1;
	uint32_t	empty : 25;		/* padding up to 32 bits */
};
85
86 struct clone_state {
87 void *c_retaddr; /* instr after clone()'s int80 */
88 int c_flags; /* flags to clone(2) */
89 int c_sig; /* signal to send on thread exit */
90 void *c_stk; /* %esp of new thread */
91 void *c_ptidp;
92 struct lx_desc *c_ldtinfo; /* thread-specific segment */
93 void *c_ctidp;
94 ucontext_t c_uc; /* original register state/sigmask */
95 lx_affmask_t c_affmask; /* CPU affinity mask */
96 volatile int *c_clone_res; /* pid/error returned to cloner */
97 int c_ptrace_event; /* ptrace(2) event for child stop */
98 void *c_ntv_stk; /* native stack for this thread */
99 size_t c_ntv_stk_sz; /* native stack size */
100 lx_tsd_t *c_lx_tsd; /* tsd area for thread */
101 };
102
/*
 * Tracks outstanding vfork(2) operations: bumped just before we vfork(2)
 * ourselves and dropped again once the vfork(2)ed child exit(2)s or
 * exec(2)s.  Objects with static storage duration start out as zero.
 */
static int is_vforked;
108
109 long
110 lx_exit(uintptr_t p1)
111 {
112 int status = (int)p1;
113 lx_tsd_t *lx_tsd;
114
115 /*
116 * If we are a vfork(2)ed child, we need to exit as quickly and
117 * cleanly as possible to avoid corrupting our parent.
118 */
119 if (is_vforked != 0) {
120 is_vforked--;
121 _exit(status);
122 }
123
124 lx_tsd = lx_get_tsd();
125
126 lx_tsd->lxtsd_exit = LX_ET_EXIT;
127 lx_tsd->lxtsd_exit_status = status;
128
129 lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE,
130 (ulong_t)status, NULL);
131
132 /*
133 * This thread is exiting. Restore the state of the thread to
134 * what it was before we started running linux code.
135 */
136 (void) setcontext(&lx_tsd->lxtsd_exit_context);
137
138 /*
139 * If we returned from the setcontext(2), something is very wrong.
140 */
141 lx_err_fatal("exit: unable to set exit context: %s", strerror(errno));
142
143 /*NOTREACHED*/
144 return (0);
145 }
146
147 long
148 lx_group_exit(uintptr_t p1)
149 {
150 int status = (int)p1;
151 lx_tsd_t *lx_tsd;
152
153 /*
154 * If we are a vfork(2)ed child, we need to exit as quickly and
155 * cleanly as possible to avoid corrupting our parent.
156 */
157 if (is_vforked != 0) {
158 is_vforked--;
159 _exit(status);
160 }
161
162 lx_tsd = lx_get_tsd();
163
164 lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP;
165 lx_tsd->lxtsd_exit_status = status;
166
167 /*
168 * This thread is exiting. Restore the state of the thread to
169 * what it was before we started running linux code.
170 */
171 (void) setcontext(&lx_tsd->lxtsd_exit_context);
172
173 /*
174 * If we returned from the setcontext(2), something is very wrong.
175 */
176 lx_err_fatal("group_exit: unable to set exit context: %s",
177 strerror(errno));
178
179 /*NOTREACHED*/
180 return (0);
181 }
182
183 static void *
184 clone_start(void *arg)
185 {
186 int rval;
187 struct clone_state *cs = (struct clone_state *)arg;
188 lx_tsd_t *lxtsd;
189
190 /*
191 * Let the kernel finish setting up all the needed state for this
192 * new thread.
193 *
194 * We already created the thread using the thr_create(3C) library
195 * call, so most of the work required to emulate lx_clone(2) has
196 * been done by the time we get to this point.
197 */
198 lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
199 lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)",
200 cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
201
202 rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp,
203 cs->c_ldtinfo, cs->c_ctidp);
204
205 /*
206 * At this point the parent is waiting for cs->c_clone_res to go
207 * non-zero to indicate the thread has been cloned. The value set
208 * in cs->c_clone_res will be used for the return value from
209 * clone().
210 */
211 if (rval < 0) {
212 *(cs->c_clone_res) = -errno;
213 lx_debug("\tkernel clone failed, errno %d\n", errno);
214 free(cs->c_lx_tsd);
215 free(cs);
216 return (NULL);
217 }
218
219 if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
220 (uintptr_t)&cs->c_affmask) != 0) {
221 *(cs->c_clone_res) = -errno;
222
223 lx_err_fatal("Unable to set affinity mask in child thread: %s",
224 strerror(errno));
225 }
226
227 /*
228 * Initialize the thread specific data for this thread.
229 */
230 lxtsd = cs->c_lx_tsd;
231 lx_init_tsd(lxtsd);
232 lxtsd->lxtsd_clone_state = cs;
233
234 /*
235 * Install the emulation stack for this thread. Register the
236 * thread-specific data structure with the stack list so that it may be
237 * freed at thread exit or fork(2).
238 */
239 lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd);
240
241 /*
242 * Let the parent know that the clone has (effectively) been
243 * completed.
244 */
245 *(cs->c_clone_res) = rval;
246
247 /*
248 * We want to load the general registers from this context, restore the
249 * original signal mask, and switch to the BRAND stack. The original
250 * signal mask was saved to the context by lx_clone().
251 */
252 cs->c_uc.uc_flags = UC_CPU | UC_SIGMASK;
253 cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND;
254
255 /*
256 * New threads will not link into the existing context chain.
257 */
258 cs->c_uc.uc_link = NULL;
259
260 /*
261 * Set stack pointer and entry point for new thread:
262 */
263 LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk;
264 LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr;
265
266 /*
267 * Return 0 to the child:
268 */
269 LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0;
270
271 /*
272 * Fire the ptrace(2) event stop in the new thread:
273 */
274 lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc);
275
276 /*
277 * Jump to the Linux process. This call cannot return.
278 */
279 lx_jump_to_linux(&cs->c_uc);
280 }
281
282 /*
283 * The way Linux handles stopping for FORK vs. CLONE does not map exactly to
284 * which syscall was used. Instead, it has to do with which signal is set in
285 * the low byte of the clone flag. The only time the CLONE event is emitted is
286 * if the clone signal (the low byte of the flags argument) is set to something
287 * other than SIGCHLD (see the Linux src in kernel/fork.c do_fork() for the
288 * actual code).
289 */
290 static int
291 ptrace_clone_event(int flags)
292 {
293 if (flags & LX_CLONE_VFORK)
294 return (LX_PTRACE_O_TRACEVFORK);
295
296 if ((flags & LX_CSIGNAL) != LX_SIGCHLD)
297 return (LX_PTRACE_O_TRACECLONE);
298
299 return (LX_PTRACE_O_TRACEFORK);
300 }
301
302 /*
303 * See glibc sysdeps/unix/sysv/linux/x86_64/clone.S code for x64 argument order
304 * and the Linux kernel/fork.c code for the various ways arguments can be passed
305 * to the clone syscall (CONFIG_CLONE_BACKWARDS, et al).
306 */
307 long
308 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
309 uintptr_t p5)
310 {
311 struct clone_state *cs;
312 int flags = (int)p1;
313 void *cldstk = (void *)p2;
314 void *ptidp = (void *)p3;
315 #if defined(_LP64)
316 void *ctidp = (void *)p4;
317 struct lx_desc *ldtinfo = (void *)p5;
318 #else /* is 32bit */
319 struct lx_desc *ldtinfo = (void *)p4;
320 void *ctidp = (void *)p5;
321 #endif
322 thread_t tid;
323 volatile int clone_res;
324 int sig;
325 int rval;
326 int pid;
327 ucontext_t *ucp;
328 sigset_t sigmask, osigmask;
329 int fork_flags = 0;
330 int ptrace_event;
331 int error = 0;
332
333 if (flags & LX_CLONE_SETTLS) {
334 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
335 "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
336 } else {
337 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
338 flags, cldstk, ptidp);
339 }
340
341 /*
342 * Only supported for pid 0 on Linux after version 2.3.21, and
343 * apparently not at all since 2.5.16.
344 */
345 if (flags & LX_CLONE_PID)
346 return (-EINVAL);
347
348 /*
349 * CLONE_THREAD requires CLONE_SIGHAND.
350 *
351 * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared
352 * in kernel 2.4 and prior.
353 * In kernel 2.6 (and later) CLONE_DETACHED was dropped completely, so
354 * we no longer have this requirement.
355 */
356
357 if (flags & CLONE_TD) {
358 if (!(flags & LX_CLONE_SIGHAND))
359 return (-EINVAL);
360 if (strncmp(lx_release, "2.4", 3) == 0 &&
361 (flags & CLONE_TD) != CLONE_TD)
362 return (-EINVAL);
363 }
364
365 ucp = lx_syscall_regs();
366
367 /* test if pointer passed by user are writable */
368 if (flags & LX_CLONE_PARENT_SETTID) {
369 if (uucopy(ptidp, &pid, sizeof (int)) != 0)
370 return (-EFAULT);
371 if (uucopy(&pid, ptidp, sizeof (int)) != 0)
372 return (-EFAULT);
373 }
374 if (flags & LX_CLONE_CHILD_SETTID) {
375 if (uucopy(ctidp, &pid, sizeof (int)) != 0)
376 return (-EFAULT);
377 if (uucopy(&pid, ctidp, sizeof (int)) != 0)
378 return (-EFAULT);
379 }
380
381 ptrace_event = ptrace_clone_event(flags);
382
383 /*
384 * Inform the in-kernel ptrace(2) subsystem that we are about to
385 * emulate a fork(2), vfork(2) or clone(2) system call.
386 */
387 lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE));
388
389 /*
390 * Handle a fork(2) operation here. If this is not a fork, a new
391 * thread will be created after this block.
392 */
393 if (IS_FORK(flags) || IS_VFORK(flags)) {
394 if (flags & LX_CLONE_PARENT) {
395 lx_unsupported("clone(2) only supports CLONE_PARENT "
396 "for threads.\n");
397 return (-ENOTSUP);
398 }
399
400 if ((flags & LX_CSIGNAL) == 0)
401 fork_flags |= FORK_NOSIGCHLD;
402
403 /*
404 * Suspend signal delivery, run the stack management prefork
405 * handler and perform the actual fork(2) operation.
406 */
407 _sigoff();
408 lx_stack_prefork();
409 if (flags & LX_CLONE_VFORK) {
410 lx_sighandlers_t saved;
411
412 /*
413 * Because we keep our signal disposition at user-land
414 * (and in memory), we must prevent it from being
415 * clobbered should our vforked child change the
416 * disposition (e.g., via sigaction()) before releasing
417 * the address space. We preserve our disposition by
418 * taking a snapshot of it before the vfork and
419 * restoring it afterwards -- which we can get away
420 * with because we know that we aren't executing
421 * concurrently with our child.
422 */
423 lx_sighandlers_save(&saved);
424 is_vforked++;
425 rval = vforkx(fork_flags);
426 if (rval != 0) {
427 is_vforked--;
428 lx_sighandlers_restore(&saved);
429 }
430 } else {
431 rval = forkx(fork_flags);
432 }
433
434 /*
435 * The parent process returns through the regular system call
436 * path here.
437 */
438 if (rval != 0) {
439 if (!IS_VFORK(flags) || rval < 0) {
440 /*
441 * Run the stack management postfork handler in
442 * the parent. If this was a vfork(2), we only
443 * run it in the parent if the fork operation
444 * failed; the vfork(2) child has already run
445 * it for our address space.
446 */
447 lx_stack_postfork();
448 }
449
450 /*
451 * Since we've already forked, we can't do much if
452 * uucopy fails, so we just ignore failure. Failure is
453 * unlikely since we've tested the memory before we did
454 * the fork.
455 */
456 if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
457 (void) uucopy(&rval, ptidp, sizeof (int));
458 }
459
460 if (rval > 0) {
461 lx_ptrace_stop_if_option(ptrace_event, B_FALSE,
462 (ulong_t)rval, NULL);
463 }
464
465 /*
466 * Re-enable signal delivery in the parent process.
467 */
468 _sigon();
469
470 return ((rval < 0) ? -errno : rval);
471 }
472
473 /*
474 * The rest of this block runs only within the new child
475 * process.
476 */
477
478 /*
479 * Run the stack management postfork handler in the child.
480 */
481 lx_stack_postfork();
482
483 if (!IS_VFORK(flags)) {
484 /*
485 * We must free the stacks and thread-specific data
486 * objects for every thread except the one duplicated
487 * from the parent by forkx().
488 */
489 lx_free_other_stacks();
490 }
491
492 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
493 /*
494 * lx_getpid should not fail, and if it does, there's
495 * not much we can do about it since we've already
496 * forked, so on failure, we just don't copy the
497 * memory.
498 */
499 pid = syscall(SYS_brand, B_GETPID);
500 if (pid >= 0)
501 (void) uucopy(&pid, ctidp, sizeof (int));
502 }
503
504 /*
505 * Set up additional data in the lx_proc_data structure as
506 * necessary.
507 */
508 if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp,
509 ldtinfo, ctidp)) < 0) {
510 return (rval);
511 }
512
513 if (IS_VFORK(flags)) {
514 ucontext_t vforkuc;
515
516 /*
517 * The vfork(2) interface is somewhat less than ideal.
518 * The unfortunate notion of borrowing the address
519 * space of the parent process requires us to jump
520 * through several hoops to prevent corrupting parent
521 * emulation state.
522 *
523 * When returning in the child, we make a copy of the
524 * system call return context and discard three pages
525 * of the native stack. Returning normally would
526 * clobber the native stack frame in which the brand
527 * library in the parent process is presently waiting.
528 *
529 * The calling program is expected to correctly use
530 * this dusty, underspecified relic. Neglecting to
531 * immediately call execve(2) or exit(2) is not
532 * cricket; this stack space will be permanently lost,
533 * not to mention myriad other undefined behaviour.
534 */
535 bcopy(ucp, &vforkuc, sizeof (vforkuc));
536 vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP;
537 vforkuc.uc_link = NULL;
538
539 lx_debug("\tvfork native stack sp %p",
540 vforkuc.uc_brand_data[1]);
541
542 /*
543 * If provided, the child needs its new stack set up.
544 */
545 if (cldstk != 0) {
546 lx_debug("\tvfork cldstk %p", cldstk);
547 LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk;
548 }
549
550 /*
551 * Stop for ptrace if required.
552 */
553 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
554
555 /*
556 * Return to the child via the specially constructed
557 * vfork(2) context.
558 */
559 LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0);
560 (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc,
561 LX_SYS_clone, 0, 0);
562
563 assert(0);
564 }
565
566 /*
567 * If provided, the child needs its new stack set up.
568 */
569 if (cldstk != 0) {
570 lx_debug("\tcldstk %p", cldstk);
571 LX_REG(ucp, REG_SP) = (uintptr_t)cldstk;
572 }
573
574 /*
575 * Stop for ptrace if required.
576 */
577 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
578
579 /*
580 * Re-enable signal delivery in the child process.
581 */
582 _sigon();
583
584 /*
585 * The child process returns via the regular emulated system
586 * call path:
587 */
588 return (0);
589 }
590
591 /*
592 * We have very restricted support.... only exactly these flags are
593 * supported
594 */
595 if (((flags & SHARED_AS) != SHARED_AS)) {
596 lx_unsupported("clone(2) requires that all or none of "
597 "CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n",
598 flags);
599 return (-ENOTSUP);
600 }
601
602 if (cldstk == NULL) {
603 lx_unsupported("clone(2) requires the caller to allocate the "
604 "child's stack.\n");
605 return (-ENOTSUP);
606 }
607
608 /*
609 * If we want a signal-on-exit, ensure that the signal is valid.
610 */
611 if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
612 lx_unsupported("clone(2) passed unsupported signal: %d", sig);
613 return (-ENOTSUP);
614 }
615
616 /*
617 * Initialise the state structure we pass as an argument to the new
618 * thread:
619 */
620 if ((cs = malloc(sizeof (*cs))) == NULL) {
621 lx_debug("could not allocate clone_state: %s", strerror(errno));
622 return (-ENOMEM);
623 }
624 cs->c_flags = flags;
625 cs->c_sig = sig;
626 cs->c_stk = cldstk;
627 cs->c_ptidp = ptidp;
628 cs->c_ldtinfo = ldtinfo;
629 cs->c_ctidp = ctidp;
630 cs->c_clone_res = &clone_res;
631 cs->c_ptrace_event = ptrace_event;
632 /*
633 * We want the new thread to return directly to the call site for
634 * the system call.
635 */
636 cs->c_retaddr = (void *)LX_REG(ucp, REG_PC);
637 /*
638 * Copy the saved context for the clone(2) system call so that the
639 * new thread may use it to initialise registers.
640 */
641 bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc));
642 if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) {
643 free(cs);
644 return (-ENOMEM);
645 }
646
647 if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
648 (uintptr_t)&cs->c_affmask) == -1) {
649 lx_err_fatal("Unable to get affinity mask for parent "
650 "thread: %s", strerror(errno));
651 }
652
653 clone_res = 0;
654
655 /*
656 * Block all signals because the thread we create won't be able to
657 * properly handle them until it's fully set up.
658 */
659 (void) sigfillset(&sigmask);
660 if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) {
661 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
662 free(cs->c_lx_tsd);
663 free(cs);
664 return (-errno);
665 }
666 cs->c_uc.uc_sigmask = osigmask;
667
668 /*
669 * Allocate the native stack for this new thread now, so that we
670 * can return failure gracefully as ENOMEM.
671 */
672 if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) {
673 free(cs->c_lx_tsd);
674 free(cs);
675 return (-ENOMEM);
676 }
677
678 rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
679
680 /*
681 * If the thread did not start, free the resources we allocated:
682 */
683 if (rval == -1) {
684 error = errno;
685 (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz);
686 free(cs->c_lx_tsd);
687 free(cs);
688 }
689
690 /*
691 * Release any pending signals
692 */
693 (void) sigprocmask(SIG_SETMASK, &osigmask, NULL);
694
695 /*
696 * Wait for the child to be created and have its tid assigned.
697 */
698 if (rval == 0) {
699 while (clone_res == 0)
700 ;
701
702 rval = clone_res;
703 lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval,
704 NULL);
705
706 return (rval);
707 } else {
708 /*
709 * Return the error from thr_create(3C).
710 */
711 return (-error);
712 }
713 }