1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015 Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <assert.h>
  29 #include <errno.h>
  30 #include <stdlib.h>
  31 #include <signal.h>
  32 #include <unistd.h>
  33 #include <ucontext.h>
  34 #include <thread.h>
  35 #include <strings.h>
  36 #include <libintl.h>
  37 #include <sys/regset.h>
  38 #include <sys/syscall.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/param.h>
  41 #include <sys/types.h>
  42 #include <sys/segments.h>
  43 #include <signal.h>
  44 #include <sys/lx_misc.h>
  45 #include <sys/lx_types.h>
  46 #include <sys/lx_signal.h>
  47 #include <sys/lx_syscall.h>
  48 #include <sys/lx_brand.h>
  49 #include <sys/lx_debug.h>
  50 #include <sys/lx_thread.h>
  51 #include <sys/fork.h>
  52 #include <sys/mman.h>
  53 #include <lx_syscall.h>
  54 
  55 
  56 #define SHARED_AS       \
  57         (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND  \
  58             | LX_CLONE_THREAD)
  59 #define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK)
  60 #define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH)
  61 
  62 #define IS_FORK(f)      (((f) & SHARED_AS) == 0)
  63 #define IS_VFORK(f)     (((f) & CLONE_VFORK) == CLONE_VFORK)
  64 
/*
 * This is dicey.  This seems to be an internal glibc structure, and not
 * part of any external interface.  Thus, it is subject to change without
 * notice.  FWIW, clone(2) itself seems to be an internal (or at least
 * unstable) interface, since strace(1) shows it differently than the man
 * page.
 *
 * Nothing in this file looks inside the structure: lx_clone() and
 * clone_start() only carry a pointer to it and hand that pointer to the
 * B_HELPER_CLONE brand call.  The field order and widths must therefore
 * not be changed here.
 *
 * NOTE(review): the field names match those of the Linux user_desc
 * structure used for thread-local-storage segments -- confirm against the
 * consumer of B_HELPER_CLONE before relying on that.
 */
struct lx_desc
{
        uint32_t entry_number;
        uint32_t base_addr;
        uint32_t limit;
        /* The remaining members are bit-fields packed into 32 bits. */
        uint32_t seg_32bit:1;
        uint32_t contents:2;
        uint32_t read_exec_only:1;
        uint32_t limit_in_pages:1;
        uint32_t seg_not_present:1;
        uint32_t useable:1;
        uint32_t empty:25;
};
  85 
/*
 * Per-clone state handed from lx_clone() to clone_start() as the argument
 * of the newly created thread.
 *
 * lx_clone() allocates the structure (and the c_lx_tsd it points to) with
 * malloc() and fills it in before creating the thread.  Once the thread is
 * running, clone_start() either frees it (when the kernel half of the clone
 * fails) or records it in the thread's lx_tsd_t (lxtsd_clone_state).
 */
struct clone_state {
        void            *c_retaddr;     /* instr after clone()'s int80 */
        int             c_flags;        /* flags to clone(2) */
        int             c_sig;          /* signal to send on thread exit */
        void            *c_stk;         /* %esp of new thread */
        /* c_ptidp, c_ldtinfo and c_ctidp are passed on to B_HELPER_CLONE */
        void            *c_ptidp;
        struct lx_desc  *c_ldtinfo;     /* thread-specific segment */
        void            *c_ctidp;
        ucontext_t      c_uc;           /* original register state/sigmask */
        lx_affmask_t    c_affmask;      /* CPU affinity mask */
        /* points at a local of lx_clone(), which spins until it is non-zero */
        volatile int    *c_clone_res;   /* pid/error returned to cloner */
        int             c_ptrace_event; /* ptrace(2) event for child stop */
        void            *c_ntv_stk;     /* native stack for this thread */
        size_t          c_ntv_stk_sz;   /* native stack size */
        lx_tsd_t        *c_lx_tsd;      /* tsd area for thread */
};
 102 
/*
 * Counter that tracks whether we are running as a vfork(2)ed child.
 *
 * lx_clone() increments it immediately before calling vforkx() and
 * decrements it again once vforkx() has returned a non-zero value, i.e.
 * in the parent.  While it is non-zero, lx_exit() and lx_group_exit() skip
 * the normal thread tear-down and call _exit() directly.
 */
static int is_vforked = 0;
 108 
 109 long
 110 lx_exit(uintptr_t p1)
 111 {
 112         int             status = (int)p1;
 113         lx_tsd_t        *lx_tsd;
 114 
 115         /*
 116          * If we are a vfork(2)ed child, we need to exit as quickly and
 117          * cleanly as possible to avoid corrupting our parent.
 118          */
 119         if (is_vforked != 0) {
 120                 is_vforked--;
 121                 _exit(status);
 122         }
 123 
 124         lx_tsd = lx_get_tsd();
 125 
 126         lx_tsd->lxtsd_exit = LX_ET_EXIT;
 127         lx_tsd->lxtsd_exit_status = status;
 128 
 129         lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE,
 130             (ulong_t)status, NULL);
 131 
 132         /*
 133          * This thread is exiting.  Restore the state of the thread to
 134          * what it was before we started running linux code.
 135          */
 136         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 137 
 138         /*
 139          * If we returned from the setcontext(2), something is very wrong.
 140          */
 141         lx_err_fatal("exit: unable to set exit context: %s", strerror(errno));
 142 
 143         /*NOTREACHED*/
 144         return (0);
 145 }
 146 
 147 long
 148 lx_group_exit(uintptr_t p1)
 149 {
 150         int             status = (int)p1;
 151         lx_tsd_t        *lx_tsd;
 152 
 153         /*
 154          * If we are a vfork(2)ed child, we need to exit as quickly and
 155          * cleanly as possible to avoid corrupting our parent.
 156          */
 157         if (is_vforked != 0) {
 158                 is_vforked--;
 159                 _exit(status);
 160         }
 161 
 162         lx_tsd = lx_get_tsd();
 163 
 164         lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP;
 165         lx_tsd->lxtsd_exit_status = status;
 166 
 167         /*
 168          * This thread is exiting.  Restore the state of the thread to
 169          * what it was before we started running linux code.
 170          */
 171         (void) setcontext(&lx_tsd->lxtsd_exit_context);
 172 
 173         /*
 174          * If we returned from the setcontext(2), something is very wrong.
 175          */
 176         lx_err_fatal("group_exit: unable to set exit context: %s",
 177             strerror(errno));
 178 
 179         /*NOTREACHED*/
 180         return (0);
 181 }
 182 
 183 static void *
 184 clone_start(void *arg)
 185 {
 186         int rval;
 187         struct clone_state *cs = (struct clone_state *)arg;
 188         lx_tsd_t *lxtsd;
 189 
 190         /*
 191          * Let the kernel finish setting up all the needed state for this
 192          * new thread.
 193          *
 194          * We already created the thread using the thr_create(3C) library
 195          * call, so most of the work required to emulate lx_clone(2) has
 196          * been done by the time we get to this point.
 197          */
 198         lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
 199         lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)",
 200             cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
 201 
 202         rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp,
 203             cs->c_ldtinfo, cs->c_ctidp);
 204 
 205         /*
 206          * At this point the parent is waiting for cs->c_clone_res to go
 207          * non-zero to indicate the thread has been cloned.  The value set
 208          * in cs->c_clone_res will be used for the return value from
 209          * clone().
 210          */
 211         if (rval < 0) {
 212                 *(cs->c_clone_res) = -errno;
 213                 lx_debug("\tkernel clone failed, errno %d\n", errno);
 214                 free(cs->c_lx_tsd);
 215                 free(cs);
 216                 return (NULL);
 217         }
 218 
 219         if (lx_sched_setaffinity(0, sizeof (cs->c_affmask),
 220             (uintptr_t)&cs->c_affmask) != 0) {
 221                 *(cs->c_clone_res) = -errno;
 222 
 223                 lx_err_fatal("Unable to set affinity mask in child thread: %s",
 224                     strerror(errno));
 225         }
 226 
 227         /*
 228          * Initialize the thread specific data for this thread.
 229          */
 230         lxtsd = cs->c_lx_tsd;
 231         lx_init_tsd(lxtsd);
 232         lxtsd->lxtsd_clone_state = cs;
 233 
 234         /*
 235          * Install the emulation stack for this thread.  Register the
 236          * thread-specific data structure with the stack list so that it may be
 237          * freed at thread exit or fork(2).
 238          */
 239         lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd);
 240 
 241         /*
 242          * Let the parent know that the clone has (effectively) been
 243          * completed.
 244          */
 245         *(cs->c_clone_res) = rval;
 246 
 247         /*
 248          * We want to load the general registers from this context, restore the
 249          * original signal mask, and switch to the BRAND stack.  The original
 250          * signal mask was saved to the context by lx_clone().
 251          */
 252         cs->c_uc.uc_flags = UC_CPU | UC_SIGMASK;
 253         cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND;
 254 
 255         /*
 256          * New threads will not link into the existing context chain.
 257          */
 258         cs->c_uc.uc_link = NULL;
 259 
 260         /*
 261          * Set stack pointer and entry point for new thread:
 262          */
 263         LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk;
 264         LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr;
 265 
 266         /*
 267          * Return 0 to the child:
 268          */
 269         LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0;
 270 
 271         /*
 272          * Fire the ptrace(2) event stop in the new thread:
 273          */
 274         lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc);
 275 
 276         /*
 277          * Jump to the Linux process.  This call cannot return.
 278          */
 279         lx_jump_to_linux(&cs->c_uc);
 280 }
 281 
 282 /*
 283  * The way Linux handles stopping for FORK vs. CLONE does not map exactly to
 284  * which syscall was used. Instead, it has to do with which signal is set in
 285  * the low byte of the clone flag. The only time the CLONE event is emitted is
 286  * if the clone signal (the low byte of the flags argument) is set to something
 287  * other than SIGCHLD (see the Linux src in kernel/fork.c do_fork() for the
 288  * actual code).
 289  */
 290 static int
 291 ptrace_clone_event(int flags)
 292 {
 293         if (flags & LX_CLONE_VFORK)
 294                 return (LX_PTRACE_O_TRACEVFORK);
 295 
 296         if ((flags & LX_CSIGNAL) != LX_SIGCHLD)
 297                 return (LX_PTRACE_O_TRACECLONE);
 298 
 299         return (LX_PTRACE_O_TRACEFORK);
 300 }
 301 
 302 /*
 303  * See glibc sysdeps/unix/sysv/linux/x86_64/clone.S code for x64 argument order
 304  * and the Linux kernel/fork.c code for the various ways arguments can be passed
 305  * to the clone syscall (CONFIG_CLONE_BACKWARDS, et al).
 306  */
 307 long
 308 lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
 309         uintptr_t p5)
 310 {
 311         struct clone_state *cs;
 312         int flags = (int)p1;
 313         void *cldstk = (void *)p2;
 314         void *ptidp = (void *)p3;
 315 #if defined(_LP64)
 316         void *ctidp = (void *)p4;
 317         struct lx_desc *ldtinfo = (void *)p5;
 318 #else /* is 32bit */
 319         struct lx_desc *ldtinfo = (void *)p4;
 320         void *ctidp = (void *)p5;
 321 #endif
 322         thread_t tid;
 323         volatile int clone_res;
 324         int sig;
 325         int rval;
 326         int pid;
 327         ucontext_t *ucp;
 328         sigset_t sigmask, osigmask;
 329         int fork_flags = 0;
 330         int ptrace_event;
 331         int error = 0;
 332 
 333         if (flags & LX_CLONE_SETTLS) {
 334                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
 335                     "ctidp=0x%p", flags, cldstk, ptidp, ldtinfo, ctidp);
 336         } else {
 337                 lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p)",
 338                     flags, cldstk, ptidp);
 339         }
 340 
 341         /*
 342          * Only supported for pid 0 on Linux after version 2.3.21, and
 343          * apparently not at all since 2.5.16.
 344          */
 345         if (flags & LX_CLONE_PID)
 346                 return (-EINVAL);
 347 
 348         /*
 349          * CLONE_THREAD requires CLONE_SIGHAND.
 350          *
 351          * CLONE_THREAD and CLONE_DETACHED must both be either set or cleared
 352          * in kernel 2.4 and prior.
 353          * In kernel 2.6 (and later) CLONE_DETACHED was dropped completely, so
 354          * we no longer have this requirement.
 355          */
 356 
 357         if (flags & CLONE_TD) {
 358                 if (!(flags & LX_CLONE_SIGHAND))
 359                         return (-EINVAL);
 360                 if (strncmp(lx_release, "2.4", 3) == 0 &&
 361                     (flags & CLONE_TD) != CLONE_TD)
 362                         return (-EINVAL);
 363         }
 364 
 365         ucp = lx_syscall_regs();
 366 
 367         /* test if pointer passed by user are writable */
 368         if (flags & LX_CLONE_PARENT_SETTID) {
 369                 if (uucopy(ptidp, &pid, sizeof (int)) != 0)
 370                         return (-EFAULT);
 371                 if (uucopy(&pid, ptidp, sizeof (int)) != 0)
 372                         return (-EFAULT);
 373         }
 374         if (flags & LX_CLONE_CHILD_SETTID) {
 375                 if (uucopy(ctidp, &pid, sizeof (int)) != 0)
 376                         return (-EFAULT);
 377                 if (uucopy(&pid, ctidp, sizeof (int)) != 0)
 378                         return (-EFAULT);
 379         }
 380 
 381         ptrace_event = ptrace_clone_event(flags);
 382 
 383         /*
 384          * Inform the in-kernel ptrace(2) subsystem that we are about to
 385          * emulate a fork(2), vfork(2) or clone(2) system call.
 386          */
 387         lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE));
 388 
 389         /*
 390          * Handle a fork(2) operation here.  If this is not a fork, a new
 391          * thread will be created after this block.
 392          */
 393         if (IS_FORK(flags) || IS_VFORK(flags)) {
 394                 if (flags & LX_CLONE_PARENT) {
 395                         lx_unsupported("clone(2) only supports CLONE_PARENT "
 396                             "for threads.\n");
 397                         return (-ENOTSUP);
 398                 }
 399 
 400                 if ((flags & LX_CSIGNAL) == 0)
 401                         fork_flags |= FORK_NOSIGCHLD;
 402 
 403                 /*
 404                  * Suspend signal delivery, run the stack management prefork
 405                  * handler and perform the actual fork(2) operation.
 406                  */
 407                 _sigoff();
 408                 lx_stack_prefork();
 409                 if (flags & LX_CLONE_VFORK) {
 410                         lx_sighandlers_t saved;
 411 
 412                         /*
 413                          * Because we keep our signal disposition at user-land
 414                          * (and in memory), we must prevent it from being
 415                          * clobbered should our vforked child change the
 416                          * disposition (e.g., via sigaction()) before releasing
 417                          * the address space.  We preserve our disposition by
 418                          * taking a snapshot of it before the vfork and
 419                          * restoring it afterwards -- which we can get away
 420                          * with because we know that we aren't executing
 421                          * concurrently with our child.
 422                          */
 423                         lx_sighandlers_save(&saved);
 424                         is_vforked++;
 425                         rval = vforkx(fork_flags);
 426                         if (rval != 0) {
 427                                 is_vforked--;
 428                                 lx_sighandlers_restore(&saved);
 429                         }
 430                 } else {
 431                         rval = forkx(fork_flags);
 432                 }
 433 
 434                 /*
 435                  * The parent process returns through the regular system call
 436                  * path here.
 437                  */
 438                 if (rval != 0) {
 439                         if (!IS_VFORK(flags) || rval < 0) {
 440                                 /*
 441                                  * Run the stack management postfork handler in
 442                                  * the parent.  If this was a vfork(2), we only
 443                                  * run it in the parent if the fork operation
 444                                  * failed; the vfork(2) child has already run
 445                                  * it for our address space.
 446                                  */
 447                                 lx_stack_postfork();
 448                         }
 449 
 450                         /*
 451                          * Since we've already forked, we can't do much if
 452                          * uucopy fails, so we just ignore failure. Failure is
 453                          * unlikely since we've tested the memory before we did
 454                          * the fork.
 455                          */
 456                         if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
 457                                 (void) uucopy(&rval, ptidp, sizeof (int));
 458                         }
 459 
 460                         if (rval > 0) {
 461                                 lx_ptrace_stop_if_option(ptrace_event, B_FALSE,
 462                                     (ulong_t)rval, NULL);
 463                         }
 464 
 465                         /*
 466                          * Re-enable signal delivery in the parent process.
 467                          */
 468                         _sigon();
 469 
 470                         return ((rval < 0) ? -errno : rval);
 471                 }
 472 
 473                 /*
 474                  * The rest of this block runs only within the new child
 475                  * process.
 476                  */
 477 
 478                 /*
 479                  * Run the stack management postfork handler in the child.
 480                  */
 481                 lx_stack_postfork();
 482 
 483                 if (!IS_VFORK(flags)) {
 484                         /*
 485                          * We must free the stacks and thread-specific data
 486                          * objects for every thread except the one duplicated
 487                          * from the parent by forkx().
 488                          */
 489                         lx_free_other_stacks();
 490                 }
 491 
 492                 if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
 493                         /*
 494                          * lx_getpid should not fail, and if it does, there's
 495                          * not much we can do about it since we've already
 496                          * forked, so on failure, we just don't copy the
 497                          * memory.
 498                          */
 499                         pid = syscall(SYS_brand, B_GETPID);
 500                         if (pid >= 0)
 501                                 (void) uucopy(&pid, ctidp, sizeof (int));
 502                 }
 503 
 504                 /*
 505                  * Set up additional data in the lx_proc_data structure as
 506                  * necessary.
 507                  */
 508                 if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp,
 509                     ldtinfo, ctidp)) < 0) {
 510                         return (rval);
 511                 }
 512 
 513                 if (IS_VFORK(flags)) {
 514                         ucontext_t vforkuc;
 515 
 516                         /*
 517                          * The vfork(2) interface is somewhat less than ideal.
 518                          * The unfortunate notion of borrowing the address
 519                          * space of the parent process requires us to jump
 520                          * through several hoops to prevent corrupting parent
 521                          * emulation state.
 522                          *
 523                          * When returning in the child, we make a copy of the
 524                          * system call return context and discard three pages
 525                          * of the native stack.  Returning normally would
 526                          * clobber the native stack frame in which the brand
 527                          * library in the parent process is presently waiting.
 528                          *
 529                          * The calling program is expected to correctly use
 530                          * this dusty, underspecified relic.  Neglecting to
 531                          * immediately call execve(2) or exit(2) is not
 532                          * cricket; this stack space will be permanently lost,
 533                          * not to mention myriad other undefined behaviour.
 534                          */
 535                         bcopy(ucp, &vforkuc, sizeof (vforkuc));
 536                         vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP;
 537                         vforkuc.uc_link = NULL;
 538 
 539                         lx_debug("\tvfork native stack sp %p",
 540                             vforkuc.uc_brand_data[1]);
 541 
 542                         /*
 543                          * If provided, the child needs its new stack set up.
 544                          */
 545                         if (cldstk != 0) {
 546                                 lx_debug("\tvfork cldstk %p", cldstk);
 547                                 LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk;
 548                         }
 549 
 550                         /*
 551                          * Stop for ptrace if required.
 552                          */
 553                         lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
 554 
 555                         /*
 556                          * Return to the child via the specially constructed
 557                          * vfork(2) context.
 558                          */
 559                         LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0);
 560                         (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc,
 561                             LX_SYS_clone, 0, 0);
 562 
 563                         assert(0);
 564                 }
 565 
 566                 /*
 567                  * If provided, the child needs its new stack set up.
 568                  */
 569                 if (cldstk != 0) {
 570                         lx_debug("\tcldstk %p", cldstk);
 571                         LX_REG(ucp, REG_SP) = (uintptr_t)cldstk;
 572                 }
 573 
 574                 /*
 575                  * Stop for ptrace if required.
 576                  */
 577                 lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
 578 
 579                 /*
 580                  * Re-enable signal delivery in the child process.
 581                  */
 582                 _sigon();
 583 
 584                 /*
 585                  * The child process returns via the regular emulated system
 586                  * call path:
 587                  */
 588                 return (0);
 589         }
 590 
 591         /*
 592          * We have very restricted support.... only exactly these flags are
 593          * supported
 594          */
 595         if (((flags & SHARED_AS) != SHARED_AS)) {
 596                 lx_unsupported("clone(2) requires that all or none of "
 597                     "CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n",
 598                     flags);
 599                 return (-ENOTSUP);
 600         }
 601 
 602         if (cldstk == NULL) {
 603                 lx_unsupported("clone(2) requires the caller to allocate the "
 604                     "child's stack.\n");
 605                 return (-ENOTSUP);
 606         }
 607 
 608         /*
 609          * If we want a signal-on-exit, ensure that the signal is valid.
 610          */
 611         if ((sig = ltos_signo[flags & LX_CSIGNAL]) == -1) {
 612                 lx_unsupported("clone(2) passed unsupported signal: %d", sig);
 613                 return (-ENOTSUP);
 614         }
 615 
 616         /*
 617          * Initialise the state structure we pass as an argument to the new
 618          * thread:
 619          */
 620         if ((cs = malloc(sizeof (*cs))) == NULL) {
 621                 lx_debug("could not allocate clone_state: %s", strerror(errno));
 622                 return (-ENOMEM);
 623         }
 624         cs->c_flags = flags;
 625         cs->c_sig = sig;
 626         cs->c_stk = cldstk;
 627         cs->c_ptidp = ptidp;
 628         cs->c_ldtinfo = ldtinfo;
 629         cs->c_ctidp = ctidp;
 630         cs->c_clone_res = &clone_res;
 631         cs->c_ptrace_event = ptrace_event;
 632         /*
 633          * We want the new thread to return directly to the call site for
 634          * the system call.
 635          */
 636         cs->c_retaddr = (void *)LX_REG(ucp, REG_PC);
 637         /*
 638          * Copy the saved context for the clone(2) system call so that the
 639          * new thread may use it to initialise registers.
 640          */
 641         bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc));
 642         if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) {
 643                 free(cs);
 644                 return (-ENOMEM);
 645         }
 646 
 647         if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
 648             (uintptr_t)&cs->c_affmask) == -1) {
 649                 lx_err_fatal("Unable to get affinity mask for parent "
 650                     "thread: %s", strerror(errno));
 651         }
 652 
 653         clone_res = 0;
 654 
 655         /*
 656          * Block all signals because the thread we create won't be able to
 657          * properly handle them until it's fully set up.
 658          */
 659         (void) sigfillset(&sigmask);
 660         if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) {
 661                 lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
 662                 free(cs->c_lx_tsd);
 663                 free(cs);
 664                 return (-errno);
 665         }
 666         cs->c_uc.uc_sigmask = osigmask;
 667 
 668         /*
 669          * Allocate the native stack for this new thread now, so that we
 670          * can return failure gracefully as ENOMEM.
 671          */
 672         if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) {
 673                 free(cs->c_lx_tsd);
 674                 free(cs);
 675                 return (-ENOMEM);
 676         }
 677 
 678         rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
 679 
 680         /*
 681          * If the thread did not start, free the resources we allocated:
 682          */
 683         if (rval == -1) {
 684                 error = errno;
 685                 (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz);
 686                 free(cs->c_lx_tsd);
 687                 free(cs);
 688         }
 689 
 690         /*
 691          * Release any pending signals
 692          */
 693         (void) sigprocmask(SIG_SETMASK, &osigmask, NULL);
 694 
 695         /*
 696          * Wait for the child to be created and have its tid assigned.
 697          */
 698         if (rval == 0) {
 699                 while (clone_res == 0)
 700                         ;
 701 
 702                 rval = clone_res;
 703                 lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval,
 704                     NULL);
 705 
 706                 return (rval);
 707         } else {
 708                 /*
 709                  * Return the error from thr_create(3C).
 710                  */
 711                 return (-error);
 712         }
 713 }