1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <assert.h>
30 #include <errno.h>
31 #include <stdlib.h>
32 #include <signal.h>
33 #include <unistd.h>
34 #include <ucontext.h>
35 #include <thread.h>
36 #include <strings.h>
37 #include <libintl.h>
38 #include <sys/regset.h>
39 #include <sys/syscall.h>
40 #include <sys/inttypes.h>
41 #include <sys/param.h>
42 #include <sys/types.h>
43 #include <sys/segments.h>
44 #include <signal.h>
45 #include <sys/lx_misc.h>
46 #include <sys/lx_types.h>
47 #include <sys/lx_signal.h>
48 #include <sys/lx_syscall.h>
49 #include <sys/lx_brand.h>
50 #include <sys/lx_debug.h>
51 #include <sys/lx_thread.h>
52
/*
 * Bits of the first clone(2) argument as used by this file.
 *
 * The low byte (LX_CSIGNAL) is a Linux signal number that lx_clone() maps
 * through ltos_signo[]; the remaining values are individual option bits.
 */
#define	LX_CSIGNAL			0x000000ff

#define	LX_CLONE_VM			0x00000100
#define	LX_CLONE_FS			0x00000200
#define	LX_CLONE_FILES			0x00000400
#define	LX_CLONE_SIGHAND		0x00000800
#define	LX_CLONE_PID			0x00001000
#define	LX_CLONE_PTRACE			0x00002000
#define	LX_CLONE_VFORK			0x00004000
#define	LX_CLONE_PARENT			0x00008000
#define	LX_CLONE_THREAD			0x00010000
#define	LX_CLONE_SYSVSEM		0x00040000
#define	LX_CLONE_SETTLS			0x00080000
#define	LX_CLONE_PARENT_SETTID		0x00100000
#define	LX_CLONE_CHILD_CLEARTID		0x00200000
#define	LX_CLONE_DETACH			0x00400000
#define	LX_CLONE_CHILD_SETTID		0x01000000

/*
 * Flag groups.
 *
 * SHARED_AS	the four sharing flags lx_clone() requires (all of them)
 *		before it will create a thread
 * CLONE_VFORK	the pair of flags that together select the vfork path
 * CLONE_TD	the thread/detach pair validated together by lx_clone()
 */
#define	SHARED_AS	\
	(LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND)
#define	CLONE_VFORK	(LX_CLONE_VM | LX_CLONE_VFORK)
#define	CLONE_TD	(LX_CLONE_THREAD | LX_CLONE_DETACH)

/* True when none of the SHARED_AS flags is present. */
#define	IS_FORK(f)	(!((f) & SHARED_AS))

/* True when every flag in CLONE_VFORK is present. */
#define	IS_VFORK(f)	(((f) & CLONE_VFORK) == CLONE_VFORK)

/*
 * Values stored in lxtsd_exit by lx_exit() and lx_group_exit() to record
 * which kind of exit was requested.  Zero means no exit is pending.
 */
#define	LX_EXIT		1
#define	LX_EXIT_GROUP	2
80
/*
 * Layout of the descriptor that lx_clone() receives through its fourth
 * argument.
 *
 * Caution: this appears to mirror a structure internal to glibc rather
 * than a documented external interface, so it may change without notice.
 * For what it is worth, clone(2) itself looks like an internal (or at
 * least unstable) interface too: strace(1) shows it differently from the
 * manual page.
 */
struct lx_desc {
	uint32_t	entry_number;
	uint32_t	base_addr;
	uint32_t	limit;

	/* One 32-bit word of packed attribute bits. */
	uint32_t	seg_32bit:1;
	uint32_t	contents:2;
	uint32_t	read_exec_only:1;
	uint32_t	limit_in_pages:1;
	uint32_t	seg_not_present:1;
	uint32_t	useable:1;
	uint32_t	empty:25;
};
101
102 struct clone_state {
103 void *c_retaddr; /* instr after clone()'s int80 */
104 int c_flags; /* flags to clone(2) */
105 int c_sig; /* signal to send on thread exit */
106 void *c_stk; /* %esp of new thread */
107 void *c_ptidp;
108 struct lx_desc *c_ldtinfo; /* thread-specific segment */
109 void *c_ctidp;
110 uintptr_t c_gs; /* Linux's %gs */
111 sigset_t c_sigmask; /* signal mask */
112 lx_affmask_t c_affmask; /* CPU affinity mask */
113 volatile int *c_clone_res; /* pid/error returned to cloner */
114 };
115
/*
 * Defined outside this file.  The callers here pass a %gs value, the
 * address to resume at, and the stack pointer to switch to, in that
 * order; clone_start() treats a return from this function as a fatal
 * assertion failure.
 */
extern void lx_setup_clone(uintptr_t gs, void *retaddr, void *stk);

/*
 * vfork(2) bookkeeping.  lx_clone() bumps this counter immediately before
 * calling vfork(2) and drops it again when vfork(2) returns non-zero.
 * lx_exit() and lx_group_exit() test it to decide whether they are
 * running in a vfork(2)ed child and must leave via _exit(2).
 */
static int is_vforked = 0;
123
124 int
125 lx_exit(uintptr_t p1)
126 {
127 int ret, status = (int)p1;
128 lx_tsd_t *lx_tsd;
129
130 /*
131 * If we are a vfork(2)ed child, we need to exit as quickly and
132 * cleanly as possible to avoid corrupting our parent.
133 */
134 if (is_vforked != 0) {
135 is_vforked--;
136 _exit(status);
137 }
138
139 if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
140 lx_err_fatal(gettext(
141 "%s: unable to read thread-specific data: %s"),
142 "exit", strerror(ret));
143
144 assert(lx_tsd != 0);
145
146 lx_tsd->lxtsd_exit = LX_EXIT;
147 lx_tsd->lxtsd_exit_status = status;
148
149 /*
150 * Block all signals in the exit context to avoid taking any signals
151 * (to the degree possible) while exiting.
152 */
153 (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
154
155 /*
156 * This thread is exiting. Restore the state of the thread to
157 * what it was before we started running linux code.
158 */
159 (void) setcontext(&lx_tsd->lxtsd_exit_context);
160
161 /*
162 * If we returned from the setcontext(2), something is very wrong.
163 */
164 lx_err_fatal(gettext("%s: unable to set exit context: %s"),
165 "exit", strerror(errno));
166
167 /*NOTREACHED*/
168 return (0);
169 }
170
171 int
172 lx_group_exit(uintptr_t p1)
173 {
174 int ret, status = (int)p1;
175 lx_tsd_t *lx_tsd;
176
177 /*
178 * If we are a vfork(2)ed child, we need to exit as quickly and
179 * cleanly as possible to avoid corrupting our parent.
180 */
181 if (is_vforked != 0) {
182 is_vforked--;
183 _exit(status);
184 }
185
186 if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
187 lx_err_fatal(gettext(
188 "%s: unable to read thread-specific data: %s"),
189 "group_exit", strerror(ret));
190
191 assert(lx_tsd != 0);
192
193 lx_tsd->lxtsd_exit = LX_EXIT_GROUP;
194 lx_tsd->lxtsd_exit_status = status;
195
196 /*
197 * Block all signals in the exit context to avoid taking any signals
198 * (to the degree possible) while exiting.
199 */
200 (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
201
202 /*
203 * This thread is exiting. Restore the state of the thread to
204 * what it was before we started running linux code.
205 */
206 (void) setcontext(&lx_tsd->lxtsd_exit_context);
207
208 /*
209 * If we returned from the setcontext(2), something is very wrong.
210 */
211 lx_err_fatal(gettext("%s: unable to set exit context: %s"),
212 "group_exit", strerror(errno));
213
214 /*NOTREACHED*/
215 return (0);
216 }
217
218 static void *
219 clone_start(void *arg)
220 {
221 int rval;
222 struct clone_state *cs = (struct clone_state *)arg;
223 lx_tsd_t lx_tsd;
224
225 /*
226 * Let the kernel finish setting up all the needed state for this
227 * new thread.
228 *
229 * We already created the thread using the thr_create(3C) library
230 * call, so most of the work required to emulate lx_clone(2) has
231 * been done by the time we get to this point. Instead of creating
232 * a new brandsys(2) subcommand to perform the last few bits of
233 * bookkeeping, we just use the lx_clone() slot in the syscall
234 * table.
235 */
236 lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
237 lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)",
238 cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
239
240 rval = syscall(SYS_brand, B_EMULATE_SYSCALL + LX_SYS_clone,
241 cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp,
242 NULL);
243
244 /*
245 * At this point the parent is waiting for cs->c_clone_res to go
246 * non-zero to indicate the thread has been cloned. The value set
247 * in cs->c_clone_res will be used for the return value from
248 * clone().
249 */
250 if (rval < 0) {
251 *(cs->c_clone_res) = -errno;
252 lx_debug("\tkernel clone failed, errno %d\n", errno);
253 return (NULL);
254 }
255
256 if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
257 (uintptr_t)&cs->c_affmask) != 0) {
258 *(cs->c_clone_res) = -errno;
259
260 lx_err_fatal(gettext(
261 "Unable to set affinity mask in child thread: %s"),
262 strerror(errno));
263 }
264
265 /* Initialize the thread specific data for this thread. */
266 bzero(&lx_tsd, sizeof (lx_tsd));
267 lx_tsd.lxtsd_gs = cs->c_gs;
268
269 /*
270 * Use the address of the stack-allocated lx_tsd as the
271 * per-thread storage area to cache various values for later
272 * use.
273 *
274 * This address is only used by this thread, so there is no
275 * danger of other threads using this storage area, nor of it
276 * being accessed once this stack frame has been freed.
277 */
278 if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) {
279 *(cs->c_clone_res) = -errno;
280 lx_err_fatal(
281 gettext("Unable to set thread-specific ptr for clone: %s"),
282 strerror(rval));
283 }
284
285 /*
286 * Save the current context of this thread.
287 *
288 * We'll restore this context when this thread attempts to exit.
289 */
290 if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) {
291 *(cs->c_clone_res) = -errno;
292
293 lx_err_fatal(gettext(
294 "Unable to initialize thread-specific exit context: %s"),
295 strerror(errno));
296 }
297
298 /*
299 * Do the final stack twiddling, reset %gs, and return to the
300 * clone(2) path.
301 */
302 if (lx_tsd.lxtsd_exit == 0) {
303 if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
304 *(cs->c_clone_res) = -errno;
305
306 lx_err_fatal(gettext(
307 "Unable to release held signals for child "
308 "thread: %s"), strerror(errno));
309 }
310
311 /*
312 * Let the parent know that the clone has (effectively) been
313 * completed.
314 */
315 *(cs->c_clone_res) = rval;
316
317 lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk);
318
319 /* lx_setup_clone() should never return. */
320 assert(0);
321 }
322
323 /*
324 * We are here because the Linux application called the exit() or
325 * exit_group() system call. In turn the brand library did a
326 * setcontext() to jump to the thread context state saved in
327 * getcontext(), above.
328 */
329 if (lx_tsd.lxtsd_exit == LX_EXIT)
330 thr_exit((void *)lx_tsd.lxtsd_exit_status);
331 else
332 exit(lx_tsd.lxtsd_exit_status);
333
334 assert(0);
335 /*NOTREACHED*/
336 }
337
338 int
339 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
340 uintptr_t p5)
341 {
342 struct clone_state *cs;
343 int flags = (int)p1;
344 void *cldstk = (void *)p2;
345 void *ptidp = (void *)p3;
346 struct lx_desc *ldtinfo = (void *)p4;
347 void *ctidp = (void *)p5;
348 thread_t tid;
349 volatile int clone_res;
350 int sig;
351 int rval;
352 int pid;
353 lx_regs_t *rp;
354 sigset_t sigmask;
355
356 if (flags & LX_CLONE_SETTLS) {
357 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
358 "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
359 } else {
360 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
361 flags, cldstk, ptidp);
362 }
363
364 /*
365 * Only supported for pid 0 on Linux
366 */
367 if (flags & LX_CLONE_PID)
368 return (-EINVAL);
369
370 /*
371 * CLONE_THREAD requires CLONE_SIGHAND.
372 *
373 * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared
374 * in kernel 2.4 and prior.
375 * In kernel 2.6 CLONE_DETACHED was dropped completely, so we no
376 * longer have this requirement.
377 */
378
379 if (flags & CLONE_TD) {
380 if (!(flags & LX_CLONE_SIGHAND))
381 return (-EINVAL);
382 if ((lx_get_kern_version() <= LX_KERN_2_4) &&
383 (flags & CLONE_TD) != CLONE_TD)
384 return (-EINVAL);
385 }
386
387 rp = lx_syscall_regs();
388
389 /* test if pointer passed by user are writable */
390 if (flags & LX_CLONE_PARENT_SETTID) {
391 if (uucopy(ptidp, &pid, sizeof (int)) != 0)
392 return (-EFAULT);
393 if (uucopy(&pid, ptidp, sizeof (int)) != 0)
394 return (-EFAULT);
395 }
396 if (flags & LX_CLONE_CHILD_SETTID) {
397 if (uucopy(ctidp, &pid, sizeof (int)) != 0)
398 return (-EFAULT);
399 if (uucopy(&pid, ctidp, sizeof (int)) != 0)
400 return (-EFAULT);
401 }
402
403 /* See if this is a fork() operation or a thr_create(). */
404 if (IS_FORK(flags) || IS_VFORK(flags)) {
405 if (flags & LX_CLONE_PARENT) {
406 lx_unsupported(gettext(
407 "clone(2) only supports CLONE_PARENT "
408 "for threads.\n"));
409 return (-ENOTSUP);
410 }
411
412 if (flags & LX_CLONE_PTRACE)
413 lx_ptrace_fork();
414
415 if (flags & LX_CLONE_VFORK) {
416 is_vforked++;
417 rval = vfork();
418 if (rval != 0)
419 is_vforked--;
420 } else {
421 rval = fork1();
422 if (rval == 0 && lx_is_rpm)
423 (void) sleep(lx_rpm_delay);
424 }
425
426 /*
427 * Since we've already forked, we can't do much if uucopy fails,
428 * so we just ignore failure. Failure is unlikely since we've
429 * tested the memory before we did the fork.
430 */
431 if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
432 (void) uucopy(&rval, ptidp, sizeof (int));
433 }
434
435 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
436 /*
437 * lx_getpid should not fail, and if it does, there's
438 * not much we can do about it since we've already
439 * forked, so on failure, we just don't copy the
440 * memory.
441 */
442 pid = lx_getpid();
443 if (pid >= 0)
444 (void) uucopy(&pid, ctidp, sizeof (int));
445 }
446
447 /* Parent just returns */
448 if (rval != 0)
449 return ((rval < 0) ? -errno : rval);
450
451 /*
452 * If provided, the child needs its new stack set up.
453 */
454 if (cldstk)
455 lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk);
456
457 return (0);
458 }
459
460 /*
461 * We have very restricted support.... only exactly these flags are
462 * supported
463 */
464 if (((flags & SHARED_AS) != SHARED_AS)) {
465 lx_unsupported(gettext(
466 "clone(2) requires that all or none of CLONE_VM "
467 "CLONE_FS, CLONE_FILES, and CLONE_SIGHAND be set.\n"));
468 return (-ENOTSUP);
469 }
470
471 if (cldstk == NULL) {
472 lx_unsupported(gettext(
473 "clone(2) requires the caller to allocate the "
474 "child's stack.\n"));
475 return (-ENOTSUP);
476 }
477
478 /*
479 * If we want a signal-on-exit, ensure that the signal is valid.
480 */
481 if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
482 lx_unsupported(gettext(
483 "clone(2) passed unsupported signal: %d"), sig);
484 return (-ENOTSUP);
485 }
486
487 /*
488 * To avoid malloc() here, we steal a part of the new thread's
489 * stack to store all the info that thread might need for
490 * initialization. We also make it 64-bit aligned for good
491 * measure.
492 */
493 cs = (struct clone_state *)
494 ((p2 - sizeof (struct clone_state)) & -((uintptr_t)8));
495 cs->c_flags = flags;
496 cs->c_sig = sig;
497 cs->c_stk = cldstk;
498 cs->c_ptidp = ptidp;
499 cs->c_ldtinfo = ldtinfo;
500 cs->c_ctidp = ctidp;
501 cs->c_clone_res = &clone_res;
502 cs->c_gs = rp->lxr_gs;
503
504 if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
505 (uintptr_t)&cs->c_affmask) == -1)
506 lx_err_fatal(gettext(
507 "Unable to get affinity mask for parent thread: %s"),
508 strerror(errno));
509
510 /*
511 * We want the new thread to return directly to the return site for
512 * the system call.
513 */
514 cs->c_retaddr = (void *)rp->lxr_eip;
515 clone_res = 0;
516
517 (void) sigfillset(&sigmask);
518
519 /*
520 * Block all signals because the thread we create won't be able to
521 * properly handle them until it's fully set up.
522 */
523 if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) {
524 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
525 return (-errno);
526 }
527
528 rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
529
530 /*
531 * Release any pending signals
532 */
533 (void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL);
534
535 /*
536 * Wait for the child to be created and have its tid assigned.
537 */
538 if (rval == 0) {
539 while (clone_res == 0)
540 ;
541
542 rval = clone_res;
543 }
544
545 return (rval);
546 }