1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015 Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <assert.h>
  29 #include <errno.h>
  30 #include <stdlib.h>
  31 #include <signal.h>
  32 #include <unistd.h>
  33 #include <ucontext.h>
  34 #include <thread.h>
  35 #include <strings.h>
  36 #include <libintl.h>
  37 #include <sys/regset.h>
  38 #include <sys/syscall.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/param.h>
  41 #include <sys/types.h>
  42 #include <sys/segments.h>
  43 #include <signal.h>
  44 #include <sys/lx_misc.h>
  45 #include <sys/lx_types.h>
  46 #include <sys/lx_signal.h>
  47 #include <sys/lx_syscall.h>
  48 #include <sys/lx_brand.h>
  49 #include <sys/lx_debug.h>
  50 #include <sys/lx_thread.h>
  51 #include <sys/fork.h>
  52 #include <sys/mman.h>
  53 #include <lx_syscall.h>
  54 
  55 
/*
 * Set of clone(2) flags that lx_clone() requires to be present together
 * when a new thread is being created.  lx_clone() rejects calls in which
 * only some of these flags are set.
 */
#define SHARED_AS       \
        (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND  \
            | LX_CLONE_THREAD)
/* Flag combination that identifies a vfork(2)-style clone. */
#define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK)
/* Thread/detach flag pair that lx_clone() validates together. */
#define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH)

/* True when none of the SHARED_AS flags are set, i.e. a plain fork(2). */
#define IS_FORK(f)      (((f) & SHARED_AS) == 0)
/* True when both flags in CLONE_VFORK are set. */
#define IS_VFORK(f)     (((f) & CLONE_VFORK) == CLONE_VFORK)
  64 
/*
 * This is dicey.  This seems to be an internal glibc structure, and not
 * part of any external interface.  Thus, it is subject to change without
 * notice.  FWIW, clone(2) itself seems to be an internal (or at least
 * unstable) interface, since strace(1) shows it differently than the man
 * page.
 *
 * Within this file the structure is never dereferenced; a pointer to it is
 * taken from the clone(2) arguments and handed unchanged to the kernel via
 * B_HELPER_CLONE.
 *
 * NOTE(review): the field layout presumably has to match what the Linux
 * caller passes for CLONE_SETTLS -- confirm before changing any member.
 */
struct lx_desc
{
        uint32_t entry_number;
        uint32_t base_addr;
        uint32_t limit;
        uint32_t seg_32bit:1;
        uint32_t contents:2;
        uint32_t read_exec_only:1;
        uint32_t limit_in_pages:1;
        uint32_t seg_not_present:1;
        uint32_t useable:1;
        uint32_t empty:25;
};
  85 
/*
 * State handed from lx_clone() to clone_start() when a new thread is
 * created.  lx_clone() allocates it with malloc() and passes it as the
 * thr_create(3C) start argument.  If the thread cannot be created,
 * lx_clone() frees it; otherwise clone_start() owns it, either freeing it
 * on failure or recording it in the thread's lx_tsd_t (lxtsd_clone_state).
 */
struct clone_state {
        void            *c_retaddr;     /* instr after clone()'s int80 */
        int             c_flags;        /* flags to clone(2) */
        int             c_sig;          /* signal to send on thread exit */
        void            *c_stk;         /* %esp of new thread */
        void            *c_ptidp;       /* parent tid pointer from clone(2) */
        struct lx_desc  *c_ldtinfo;     /* thread-specific segment */
        void            *c_ctidp;       /* child tid pointer from clone(2) */
        ucontext_t      c_uc;           /* original register state */
        sigset_t        c_sigmask;      /* signal mask */
        lx_affmask_t    c_affmask;      /* CPU affinity mask */
        volatile int    *c_clone_res;   /* pid/error returned to cloner */
        int             c_ptrace_event; /* ptrace(2) event for child stop */
        void            *c_ntv_stk;     /* native stack for this thread */
        size_t          c_ntv_stk_sz;   /* native stack size */
        lx_tsd_t        *c_lx_tsd;      /* tsd area for thread */
};
 103 
/*
 * Counter incremented when we vfork(2) ourselves, and decremented when the
 * vfork(2)ed child exit(2)s or exec(2)s.
 *
 * In this file it is incremented by lx_clone() immediately before
 * vforkx(), decremented again in the parent once vforkx() returns a
 * non-zero value, and decremented by lx_exit() and lx_group_exit() just
 * before they call _exit().  While it is non-zero, both exit paths skip
 * the normal thread teardown.
 *
 * NOTE(review): no exec(2) path in the code visible here touches this
 * counter -- confirm where (or whether) that decrement happens.
 */
static int is_vforked = 0;
 109 
 110 long
 111 lx_exit(uintptr_t p1)
 112 {
 113         int             status = (int)p1;
 114         lx_tsd_t        *lx_tsd;
 115 
 116         /*
 117          * If we are a vfork(2)ed child, we need to exit as quickly and
 118          * cleanly as possible to avoid corrupting our parent.
 119          */
 120         if (is_vforked != 0) {
 121                 is_vforked--;
 122                 _exit(status);
 123         }
 124 
 125         lx_tsd = lx_get_tsd();
 126 
 127         lx_tsd->lxtsd_exit = LX_ET_EXIT;
 128         lx_tsd->lxtsd_exit_status = status;
 129 
 130         lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE,
 131             (ulong_t)status, NULL);
 132 
 133         /*
 134          * This thread is exiting.  Restore the state of the thread to
 135          * what it was before we started running linux code.
 136          */
 137         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 138 
 139         /*
 140          * If we returned from the setcontext(2), something is very wrong.
 141          */
 142         lx_err_fatal("exit: unable to set exit context: %s", strerror(errno));
 143 
 144         /*NOTREACHED*/
 145         return (0);
 146 }
 147 
 148 long
 149 lx_group_exit(uintptr_t p1)
 150 {
 151         int             status = (int)p1;
 152         lx_tsd_t        *lx_tsd;
 153 
 154         /*
 155          * If we are a vfork(2)ed child, we need to exit as quickly and
 156          * cleanly as possible to avoid corrupting our parent.
 157          */
 158         if (is_vforked != 0) {
 159                 is_vforked--;
 160                 _exit(status);
 161         }
 162 
 163         lx_tsd = lx_get_tsd();
 164 
 165         lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP;
 166         lx_tsd->lxtsd_exit_status = status;
 167 
 168         /*
 169          * This thread is exiting.  Restore the state of the thread to
 170          * what it was before we started running linux code.
 171          */
 172         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 173 
 174         /*
 175          * If we returned from the setcontext(2), something is very wrong.
 176          */
 177         lx_err_fatal("group_exit: unable to set exit context: %s",
 178             strerror(errno));
 179 
 180         /*NOTREACHED*/
 181         return (0);
 182 }
 183 
 184 static void *
 185 clone_start(void *arg)
 186 {
 187         int rval;
 188         struct clone_state *cs = (struct clone_state *)arg;
 189         lx_tsd_t *lxtsd;
 190 
 191         /*
 192          * Let the kernel finish setting up all the needed state for this
 193          * new thread.
 194          *
 195          * We already created the thread using the thr_create(3C) library
 196          * call, so most of the work required to emulate lx_clone(2) has
 197          * been done by the time we get to this point.
 198          */
 199         lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
 200         lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)",
 201             cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
 202 
 203         rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp,
 204             cs->c_ldtinfo, cs->c_ctidp);
 205 
 206         /*
 207          * At this point the parent is waiting for cs->c_clone_res to go
 208          * non-zero to indicate the thread has been cloned.  The value set
 209          * in cs->c_clone_res will be used for the return value from
 210          * clone().
 211          */
 212         if (rval < 0) {
 213                 *(cs->c_clone_res) = -errno;
 214                 lx_debug("\tkernel clone failed, errno %d\n", errno);
 215                 free(cs->c_lx_tsd);
 216                 free(cs);
 217                 return (NULL);
 218         }
 219 
 220         if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
 221             (uintptr_t)&cs->c_affmask) != 0) {
 222                 *(cs->c_clone_res) = -errno;
 223 
 224                 lx_err_fatal("Unable to set affinity mask in child thread: %s",
 225                     strerror(errno));
 226         }
 227 
 228         /*
 229          * Initialize the thread specific data for this thread.
 230          */
 231         lxtsd = cs->c_lx_tsd;
 232         lx_init_tsd(lxtsd);
 233         lxtsd->lxtsd_clone_state = cs;
 234 
 235         /*
 236          * Install the emulation stack for this thread.  Register the
 237          * thread-specific data structure with the stack list so that it may be
 238          * freed at thread exit or fork(2).
 239          */
 240         lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd);
 241 
 242         if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
 243                 *(cs->c_clone_res) = -errno;
 244 
 245                 lx_err_fatal("Unable to release held signals for child "
 246                     "thread: %s", strerror(errno));
 247         }
 248 
 249         /*
 250          * Let the parent know that the clone has (effectively) been
 251          * completed.
 252          */
 253         *(cs->c_clone_res) = rval;
 254 
 255         /*
 256          * We want to load the general registers from this context, and
 257          * switch to the BRAND stack.
 258          */
 259         cs->c_uc.uc_flags = UC_CPU;
 260         cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND;
 261 
 262         /*
 263          * New threads will not link into the existing context chain.
 264          */
 265         cs->c_uc.uc_link = NULL;
 266 
 267         /*
 268          * Set stack pointer and entry point for new thread:
 269          */
 270         LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk;
 271         LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr;
 272 
 273         /*
 274          * Return 0 to the child:
 275          */
 276         LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0;
 277 
 278         /*
 279          * Fire the ptrace(2) event stop in the new thread:
 280          */
 281         lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc);
 282 
 283         /*
 284          * Jump to the Linux process.  The system call must not return.
 285          */
 286         if (syscall(SYS_brand, B_JUMP_TO_LINUX, &cs->c_uc) == -1) {
 287                 lx_err_fatal("B_JUMP_TO_LINUX failed: %s",
 288                     strerror(errno));
 289         }
 290         abort();
 291 
 292         /*NOTREACHED*/
 293         return (NULL);
 294 }
 295 
 296 /*
 297  * The way Linux handles stopping for FORK vs. CLONE does not map exactly to
 298  * which syscall was used. Instead, it has to do with which signal is set in
 299  * the low byte of the clone flag. The only time the CLONE event is emitted is
 300  * if the clone signal (the low byte of the flags argument) is set to something
 301  * other than SIGCHLD (see the Linux src in kernel/fork.c do_fork() for the
 302  * actual code).
 303  */
 304 static int
 305 ptrace_clone_event(int flags)
 306 {
 307         if (flags & LX_CLONE_VFORK)
 308                 return (LX_PTRACE_O_TRACEVFORK);
 309 
 310         if ((flags & LX_CSIGNAL) != LX_SIGCHLD)
 311                 return (LX_PTRACE_O_TRACECLONE);
 312 
 313         return (LX_PTRACE_O_TRACEFORK);
 314 }
 315 
 316 /*
 317  * See glibc sysdeps/unix/sysv/linux/x86_64/clone.S code for x64 argument order
 318  * and the Linux kernel/fork.c code for the various ways arguments can be passed
 319  * to the clone syscall (CONFIG_CLONE_BACKWARDS, et al).
 320  */
 321 long
 322 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
 323         uintptr_t p5)
 324 {
 325         struct clone_state *cs;
 326         int flags = (int)p1;
 327         void *cldstk = (void *)p2;
 328         void *ptidp = (void *)p3;
 329 #if defined(_LP64)
 330         void *ctidp = (void *)p4;
 331         struct lx_desc *ldtinfo = (void *)p5;
 332 #else /* is 32bit */
 333         struct lx_desc *ldtinfo = (void *)p4;
 334         void *ctidp = (void *)p5;
 335 #endif
 336         thread_t tid;
 337         volatile int clone_res;
 338         int sig;
 339         int rval;
 340         int pid;
 341         ucontext_t *ucp;
 342         sigset_t sigmask, osigmask;
 343         int fork_flags = 0;
 344         int ptrace_event;
 345         int error = 0;
 346 
 347         if (flags & LX_CLONE_SETTLS) {
 348                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
 349                     "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
 350         } else {
 351                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
 352                     flags, cldstk, ptidp);
 353         }
 354 
 355         /*
 356          * Only supported for pid 0 on Linux after version 2.3.21, and
 357          * apparently not at all since 2.5.16.
 358          */
 359         if (flags & LX_CLONE_PID)
 360                 return (-EINVAL);
 361 
 362         /*
 363          * CLONE_THREAD requires CLONE_SIGHAND.
 364          *
 365          * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared
 366          * in kernel 2.4 and prior.
 367          * In kernel 2.6 (and later) CLONE_DETACHED was dropped completely, so
 368          * we no longer have this requirement.
 369          */
 370 
 371         if (flags & CLONE_TD) {
 372                 if (!(flags & LX_CLONE_SIGHAND))
 373                         return (-EINVAL);
 374                 if (strncmp(lx_release, "2.4", 3) == 0 &&
 375                     (flags & CLONE_TD) != CLONE_TD)
 376                         return (-EINVAL);
 377         }
 378 
 379         ucp = lx_syscall_regs();
 380 
 381         /* test if pointer passed by user are writable */
 382         if (flags & LX_CLONE_PARENT_SETTID) {
 383                 if (uucopy(ptidp, &pid, sizeof (int)) != 0)
 384                         return (-EFAULT);
 385                 if (uucopy(&pid, ptidp, sizeof (int)) != 0)
 386                         return (-EFAULT);
 387         }
 388         if (flags & LX_CLONE_CHILD_SETTID) {
 389                 if (uucopy(ctidp, &pid, sizeof (int)) != 0)
 390                         return (-EFAULT);
 391                 if (uucopy(&pid, ctidp, sizeof (int)) != 0)
 392                         return (-EFAULT);
 393         }
 394 
 395         ptrace_event = ptrace_clone_event(flags);
 396 
 397         /*
 398          * Inform the in-kernel ptrace(2) subsystem that we are about to
 399          * emulate a fork(2), vfork(2) or clone(2) system call.
 400          */
 401         lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE));
 402 
 403         /*
 404          * Handle a fork(2) operation here.  If this is not a fork, a new
 405          * thread will be created after this block.
 406          */
 407         if (IS_FORK(flags) || IS_VFORK(flags)) {
 408                 if (flags & LX_CLONE_PARENT) {
 409                         lx_unsupported("clone(2) only supports CLONE_PARENT "
 410                             "for threads.\n");
 411                         return (-ENOTSUP);
 412                 }
 413 
 414                 if ((flags & LX_CSIGNAL) == 0)
 415                         fork_flags |= FORK_NOSIGCHLD;
 416 
 417                 /*
 418                  * Suspend signal delivery, run the stack management prefork
 419                  * handler and perform the actual fork(2) operation.
 420                  */
 421                 _sigoff();
 422                 lx_stack_prefork();
 423                 if (flags & LX_CLONE_VFORK) {
 424                         lx_sighandlers_t saved;
 425 
 426                         /*
 427                          * Because we keep our signal disposition at user-land
 428                          * (and in memory), we must prevent it from being
 429                          * clobbered should our vforked child change the
 430                          * disposition (e.g., via sigaction()) before releasing
 431                          * the address space.  We preserve our disposition by
 432                          * taking a snapshot of it before the vfork and
 433                          * restoring it afterwards -- which we can get away
 434                          * with because we know that we aren't executing
 435                          * concurrently with our child.
 436                          */
 437                         lx_sighandlers_save(&saved);
 438                         is_vforked++;
 439                         rval = vforkx(fork_flags);
 440                         if (rval != 0) {
 441                                 is_vforked--;
 442                                 lx_sighandlers_restore(&saved);
 443                         }
 444                 } else {
 445                         rval = forkx(fork_flags);
 446                 }
 447 
 448                 /*
 449                  * The parent process returns through the regular system call
 450                  * path here.
 451                  */
 452                 if (rval != 0) {
 453                         if (!IS_VFORK(flags) || rval < 0) {
 454                                 /*
 455                                  * Run the stack management postfork handler in
 456                                  * the parent.  If this was a vfork(2), we only
 457                                  * run it in the parent if the fork operation
 458                                  * failed; the vfork(2) child has already run
 459                                  * it for our address space.
 460                                  */
 461                                 lx_stack_postfork();
 462                         }
 463 
 464                         /*
 465                          * Since we've already forked, we can't do much if
 466                          * uucopy fails, so we just ignore failure. Failure is
 467                          * unlikely since we've tested the memory before we did
 468                          * the fork.
 469                          */
 470                         if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
 471                                 (void) uucopy(&rval, ptidp, sizeof (int));
 472                         }
 473 
 474                         if (rval > 0) {
 475                                 lx_ptrace_stop_if_option(ptrace_event, B_FALSE,
 476                                     (ulong_t)rval, NULL);
 477                         }
 478 
 479                         /*
 480                          * Re-enable signal delivery in the parent process.
 481                          */
 482                         _sigon();
 483 
 484                         return ((rval < 0) ? -errno : rval);
 485                 }
 486 
 487                 /*
 488                  * The rest of this block runs only within the new child
 489                  * process.
 490                  */
 491 
 492                 /*
 493                  * Run the stack management postfork handler in the child.
 494                  */
 495                 lx_stack_postfork();
 496 
 497                 if (!IS_VFORK(flags)) {
 498                         /*
 499                          * We must free the stacks and thread-specific data
 500                          * objects for every thread except the one duplicated
 501                          * from the parent by forkx().
 502                          */
 503                         lx_free_other_stacks();
 504                 }
 505 
 506                 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
 507                         /*
 508                          * lx_getpid should not fail, and if it does, there's
 509                          * not much we can do about it since we've already
 510                          * forked, so on failure, we just don't copy the
 511                          * memory.
 512                          */
 513                         pid = syscall(SYS_brand, B_GETPID);
 514                         if (pid >= 0)
 515                                 (void) uucopy(&pid, ctidp, sizeof (int));
 516                 }
 517 
 518                 /*
 519                  * Set up additional data in the lx_proc_data structure as
 520                  * necessary.
 521                  */
 522                 if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp,
 523                     ldtinfo, ctidp)) < 0) {
 524                         return (rval);
 525                 }
 526 
 527                 if (IS_VFORK(flags)) {
 528                         ucontext_t vforkuc;
 529 
 530                         /*
 531                          * The vfork(2) interface is somewhat less than ideal.
 532                          * The unfortunate notion of borrowing the address
 533                          * space of the parent process requires us to jump
 534                          * through several hoops to prevent corrupting parent
 535                          * emulation state.
 536                          *
 537                          * When returning in the child, we make a copy of the
 538                          * system call return context and discard three pages
 539                          * of the native stack.  Returning normally would
 540                          * clobber the native stack frame in which the brand
 541                          * library in the parent process is presently waiting.
 542                          *
 543                          * The calling program is expected to correctly use
 544                          * this dusty, underspecified relic.  Neglecting to
 545                          * immediately call execve(2) or exit(2) is not
 546                          * cricket; this stack space will be permanently lost,
 547                          * not to mention myriad other undefined behaviour.
 548                          */
 549                         bcopy(ucp, &vforkuc, sizeof (vforkuc));
 550                         vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP;
 551                         vforkuc.uc_link = NULL;
 552 
 553                         lx_debug("\tvfork native stack sp %p",
 554                             vforkuc.uc_brand_data[1]);
 555 
 556                         /*
 557                          * If provided, the child needs its new stack set up.
 558                          */
 559                         if (cldstk != 0) {
 560                                 lx_debug("\tvfork cldstk %p", cldstk);
 561                                 LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk;
 562                         }
 563 
 564                         /*
 565                          * Stop for ptrace if required.
 566                          */
 567                         lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
 568 
 569                         /*
 570                          * Return to the child via the specially constructed
 571                          * vfork(2) context.
 572                          */
 573                         LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0);
 574                         (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc,
 575                             LX_SYS_clone, 0, 0);
 576 
 577                         assert(0);
 578                 }
 579 
 580                 /*
 581                  * If provided, the child needs its new stack set up.
 582                  */
 583                 if (cldstk != 0) {
 584                         lx_debug("\tcldstk %p", cldstk);
 585                         LX_REG(ucp, REG_SP) = (uintptr_t)cldstk;
 586                 }
 587 
 588                 /*
 589                  * Stop for ptrace if required.
 590                  */
 591                 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
 592 
 593                 /*
 594                  * Re-enable signal delivery in the child process.
 595                  */
 596                 _sigon();
 597 
 598                 /*
 599                  * The child process returns via the regular emulated system
 600                  * call path:
 601                  */
 602                 return (0);
 603         }
 604 
 605         /*
 606          * We have very restricted support.... only exactly these flags are
 607          * supported
 608          */
 609         if (((flags & SHARED_AS) != SHARED_AS)) {
 610                 lx_unsupported("clone(2) requires that all or none of "
 611                     "CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n",
 612                     flags);
 613                 return (-ENOTSUP);
 614         }
 615 
 616         if (cldstk == NULL) {
 617                 lx_unsupported("clone(2) requires the caller to allocate the "
 618                     "child's stack.\n");
 619                 return (-ENOTSUP);
 620         }
 621 
 622         /*
 623          * If we want a signal-on-exit, ensure that the signal is valid.
 624          */
 625         if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
 626                 lx_unsupported("clone(2) passed unsupported signal: %d", sig);
 627                 return (-ENOTSUP);
 628         }
 629 
 630         /*
 631          * Initialise the state structure we pass as an argument to the new
 632          * thread:
 633          */
 634         if ((cs = malloc(sizeof (*cs))) == NULL) {
 635                 lx_debug("could not allocate clone_state: %s", strerror(errno));
 636                 return (-ENOMEM);
 637         }
 638         cs->c_flags = flags;
 639         cs->c_sig = sig;
 640         cs->c_stk = cldstk;
 641         cs->c_ptidp = ptidp;
 642         cs->c_ldtinfo = ldtinfo;
 643         cs->c_ctidp = ctidp;
 644         cs->c_clone_res = &clone_res;
 645         cs->c_ptrace_event = ptrace_event;
 646         /*
 647          * We want the new thread to return directly to the call site for
 648          * the system call.
 649          */
 650         cs->c_retaddr = (void *)LX_REG(ucp, REG_PC);
 651         /*
 652          * Copy the saved context for the clone(2) system call so that the
 653          * new thread may use it to initialise registers.
 654          */
 655         bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc));
 656         if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) {
 657                 free(cs);
 658                 return (-ENOMEM);
 659         }
 660 
 661         if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
 662             (uintptr_t)&cs->c_affmask) == -1) {
 663                 lx_err_fatal("Unable to get affinity mask for parent "
 664                     "thread: %s", strerror(errno));
 665         }
 666 
 667         clone_res = 0;
 668 
 669         (void) sigfillset(&sigmask);
 670 
 671         /*
 672          * Block all signals because the thread we create won't be able to
 673          * properly handle them until it's fully set up.
 674          */
 675         if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) {
 676                 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
 677                 free(cs->c_lx_tsd);
 678                 free(cs);
 679                 return (-errno);
 680         }
 681         cs->c_sigmask = osigmask;
 682 
 683         /*
 684          * Allocate the native stack for this new thread now, so that we
 685          * can return failure gracefully as ENOMEM.
 686          */
 687         if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) {
 688                 free(cs->c_lx_tsd);
 689                 free(cs);
 690                 return (-ENOMEM);
 691         }
 692 
 693         rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
 694 
 695         /*
 696          * If the thread did not start, free the resources we allocated:
 697          */
 698         if (rval == -1) {
 699                 error = errno;
 700                 (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz);
 701                 free(cs->c_lx_tsd);
 702                 free(cs);
 703         }
 704 
 705         /*
 706          * Release any pending signals
 707          */
 708         (void) sigprocmask(SIG_SETMASK, &osigmask, NULL);
 709 
 710         /*
 711          * Wait for the child to be created and have its tid assigned.
 712          */
 713         if (rval == 0) {
 714                 while (clone_res == 0)
 715                         ;
 716 
 717                 rval = clone_res;
 718                 lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval,
 719                     NULL);
 720 
 721                 return (rval);
 722         } else {
 723                 /*
 724                  * Return the error from thr_create(3C).
 725                  */
 726                 return (-error);
 727         }
 728 }