1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/kmem.h>
  29 #include <sys/errno.h>
  30 #include <sys/thread.h>
  31 #include <sys/systm.h>
  32 #include <sys/syscall.h>
  33 #include <sys/proc.h>
  34 #include <sys/modctl.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/model.h>
  37 #include <sys/exec.h>
  38 #include <sys/lx_impl.h>
  39 #include <sys/machbrand.h>
  40 #include <sys/lx_syscalls.h>
  41 #include <sys/lx_pid.h>
  42 #include <sys/lx_futex.h>
  43 #include <sys/lx_brand.h>
  44 #include <sys/termios.h>
  45 #include <sys/sunddi.h>
  46 #include <sys/ddi.h>
  47 #include <sys/vnode.h>
  48 #include <sys/pathname.h>
  49 #include <sys/auxv.h>
  50 #include <sys/priv.h>
  51 #include <sys/regset.h>
  52 #include <sys/privregs.h>
  53 #include <sys/archsystm.h>
  54 #include <sys/zone.h>
  55 #include <sys/brand.h>
  56 
  57 int     lx_debug = 0;
  58 
  59 void    lx_init_brand_data(zone_t *);
  60 void    lx_free_brand_data(zone_t *);
  61 void    lx_setbrand(proc_t *);
  62 int     lx_getattr(zone_t *, int, void *, size_t *);
  63 int     lx_setattr(zone_t *, int, void *, size_t);
  64 int     lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
  65                 uintptr_t, uintptr_t, uintptr_t);
  66 int     lx_get_kern_version(void);
  67 void    lx_set_kern_version(zone_t *, int);
  68 void    lx_copy_procdata(proc_t *, proc_t *);
  69 
  70 extern void lx_setrval(klwp_t *, int, int);
  71 extern void lx_proc_exit(proc_t *, klwp_t *);
  72 extern void lx_exec();
  73 extern int lx_initlwp(klwp_t *);
  74 extern void lx_forklwp(klwp_t *, klwp_t *);
  75 extern void lx_exitlwp(klwp_t *);
  76 extern void lx_freelwp(klwp_t *);
  77 extern greg_t lx_fixsegreg(greg_t, model_t);
  78 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
  79 
  80 int lx_systrace_brand_enabled;
  81 
  82 lx_systrace_f *lx_systrace_entry_ptr;
  83 lx_systrace_f *lx_systrace_return_ptr;
  84 
  85 static int lx_systrace_enabled;
  86 
  87 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
  88     struct intpdata *idata, int level, long *execsz, int setid,
  89     caddr_t exec_file, struct cred *cred, int brand_action);
  90 
  91 /* lx brand */
  92 struct brand_ops lx_brops = {
  93         lx_init_brand_data,
  94         lx_free_brand_data,
  95         lx_brandsys,
  96         lx_setbrand,
  97         lx_getattr,
  98         lx_setattr,
  99         lx_copy_procdata,
 100         lx_proc_exit,
 101         lx_exec,
 102         lx_setrval,
 103         lx_initlwp,
 104         lx_forklwp,
 105         lx_freelwp,
 106         lx_exitlwp,
 107         lx_elfexec,
 108         NULL,
 109         NULL,
 110         NSIG,
 111 };
 112 
 113 struct brand_mach_ops lx_mops = {
 114         NULL,
 115         lx_brand_int80_callback,
 116         NULL,
 117         NULL,
 118         NULL,
 119         lx_fixsegreg,
 120 };
 121 
 122 struct brand lx_brand = {
 123         BRAND_VER_1,
 124         "lx",
 125         &lx_brops,
 126         &lx_mops
 127 };
 128 
 129 static struct modlbrand modlbrand = {
 130         &mod_brandops, "lx brand", &lx_brand
 131 };
 132 
 133 static struct modlinkage modlinkage = {
 134         MODREV_1, (void *)&modlbrand, NULL
 135 };
 136 
 137 void
 138 lx_proc_exit(proc_t *p, klwp_t *lwp)
 139 {
 140         zone_t *z = p->p_zone;
 141 
 142         ASSERT(p->p_brand != NULL);
 143         ASSERT(p->p_brand_data != NULL);
 144 
 145         /*
 146          * If init is dying and we aren't explicitly shutting down the zone
 147          * or the system, then Solaris is about to restart init.  The Linux
 148          * init is not designed to handle a restart, which it interprets as
 149          * a reboot.  To give it a sane environment in which to run, we
 150          * reboot the zone.
 151          */
 152         if (p->p_pid == z->zone_proc_initpid) {
 153                 if (z->zone_boot_err == 0 &&
 154                     z->zone_restart_init &&
 155                     zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
 156                     zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN)
 157                         (void) zone_kadmin(A_REBOOT, 0, NULL, CRED());
 158         }
 159         lx_exitlwp(lwp);
 160         kmem_free(p->p_brand_data, sizeof (struct lx_proc_data));
 161         p->p_brand_data = NULL;
 162 }
 163 
 164 void
 165 lx_setbrand(proc_t *p)
 166 {
 167         kthread_t *t = p->p_tlist;
 168         int err;
 169 
 170         ASSERT(p->p_brand_data == NULL);
 171         ASSERT(ttolxlwp(curthread) == NULL);
 172 
 173         p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP);
 174 
 175         /*
 176          * This routine can only be called for single-threaded processes.
 177          * Since lx_initlwp() can only fail if we run out of PIDs for
 178          * multithreaded processes, we know that this can never fail.
 179          */
 180         err = lx_initlwp(t->t_lwp);
 181         ASSERT(err == 0);
 182 }
 183 
 184 /* ARGSUSED */
 185 int
 186 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
 187 {
 188         boolean_t val;
 189         int num;
 190 
 191         if (attr == LX_ATTR_RESTART_INIT) {
 192                 if (bufsize > sizeof (boolean_t))
 193                         return (ERANGE);
 194                 if (copyin(buf, &val, sizeof (val)) != 0)
 195                         return (EFAULT);
 196                 if (val != B_TRUE && val != B_FALSE)
 197                         return (EINVAL);
 198                 zone->zone_restart_init = val;
 199                 return (0);
 200         } else if (attr == LX_KERN_VERSION_NUM) {
 201                 if (bufsize > sizeof (int))
 202                         return (ERANGE);
 203                 if (copyin(buf, &num, sizeof (num)) != 0)
 204                         return (EFAULT);
 205                 lx_set_kern_version(zone, num);
 206                 return (0);
 207         }
 208         return (EINVAL);
 209 }
 210 
 211 /* ARGSUSED */
 212 int
 213 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
 214 {
 215         int num;
 216         if (attr == LX_ATTR_RESTART_INIT) {
 217                 if (*bufsize < sizeof (boolean_t))
 218                         return (ERANGE);
 219                 if (copyout(&zone->zone_restart_init, buf,
 220                     sizeof (boolean_t)) != 0)
 221                         return (EFAULT);
 222                 *bufsize = sizeof (boolean_t);
 223                 return (0);
 224         } else if (attr == LX_KERN_VERSION_NUM) {
 225                 if (*bufsize < sizeof (int))
 226                         return (ERANGE);
 227                 num = lx_get_kern_version();
 228                 if (copyout(&num, buf, sizeof (int)) != 0)
 229                         return (EFAULT);
 230                 *bufsize = sizeof (int);
 231                 return (0);
 232         }
 233         return (-EINVAL);
 234 }
 235 
 236 /*
 237  * Enable ptrace system call tracing for the given LWP. This is done by
 238  * both setting the flag in that LWP's brand data (in the kernel) and setting
 239  * the process-wide trace flag (in the brand library of the traced process).
 240  */
 241 static int
 242 lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set)
 243 {
 244         proc_t *p;
 245         kthread_t *t;
 246         klwp_t *lwp;
 247         lx_proc_data_t *lpdp;
 248         lx_lwp_data_t *lldp;
 249         uintptr_t addr;
 250         int ret, flag = 1;
 251 
 252         if ((p = sprlock(pid)) == NULL)
 253                 return (ESRCH);
 254 
 255         if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) {
 256                 sprunlock(p);
 257                 return (EPERM);
 258         }
 259 
 260         if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) {
 261                 sprunlock(p);
 262                 return (ESRCH);
 263         }
 264 
 265         if ((lpdp = p->p_brand_data) == NULL ||
 266             (lldp = lwp->lwp_brand) == NULL) {
 267                 sprunlock(p);
 268                 return (ESRCH);
 269         }
 270 
 271         if (set) {
 272                 /*
 273                  * Enable the ptrace flag for this LWP and this process. Note
 274                  * that we will turn off the LWP's ptrace flag, but we don't
 275                  * turn off the process's ptrace flag.
 276                  */
 277                 lldp->br_ptrace = 1;
 278                 lpdp->l_ptrace = 1;
 279 
 280                 addr = lpdp->l_traceflag;
 281 
 282                 mutex_exit(&p->p_lock);
 283 
 284                 /*
 285                  * This can fail only in some rare corner cases where the
 286                  * process is exiting or we're completely out of memory. In
 287                  * these cases, it's sufficient to return an error to the ptrace
 288                  * consumer and leave the process-wide flag set.
 289                  */
 290                 ret = uwrite(p, &flag, sizeof (flag), addr);
 291 
 292                 mutex_enter(&p->p_lock);
 293 
 294                 /*
 295                  * If we couldn't set the trace flag, unset the LWP's ptrace
 296                  * flag as there ptrace consumer won't expect this LWP to stop.
 297                  */
 298                 if (ret != 0)
 299                         lldp->br_ptrace = 0;
 300         } else {
 301                 lldp->br_ptrace = 0;
 302                 ret = 0;
 303         }
 304 
 305         sprunlock(p);
 306 
 307         if (ret != 0)
 308                 ret = EIO;
 309 
 310         return (ret);
 311 }
 312 
 313 static void
 314 lx_ptrace_fire(void)
 315 {
 316         kthread_t *t = curthread;
 317         klwp_t *lwp = ttolwp(t);
 318         lx_lwp_data_t *lldp = lwp->lwp_brand;
 319 
 320         /*
 321          * The ptrace flag only applies until the next event is encountered
 322          * for the given LWP. If it's set, turn off the flag and poke the
 323          * controlling process by raising a signal.
 324          */
 325         if (lldp->br_ptrace) {
 326                 lldp->br_ptrace = 0;
 327                 tsignal(t, SIGTRAP);
 328         }
 329 }
 330 
 331 void
 332 lx_brand_systrace_enable(void)
 333 {
 334         extern void lx_brand_int80_enable(void);
 335 
 336         ASSERT(!lx_systrace_enabled);
 337 
 338         lx_brand_int80_enable();
 339 
 340         lx_systrace_enabled = 1;
 341 }
 342 
 343 void
 344 lx_brand_systrace_disable(void)
 345 {
 346         extern void lx_brand_int80_disable(void);
 347 
 348         ASSERT(lx_systrace_enabled);
 349 
 350         lx_brand_int80_disable();
 351 
 352         lx_systrace_enabled = 0;
 353 }
 354 
 355 void
 356 lx_init_brand_data(zone_t *zone)
 357 {
 358         lx_zone_data_t *data;
 359         ASSERT(zone->zone_brand == &lx_brand);
 360         ASSERT(zone->zone_brand_data == NULL);
 361         data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
 362         /*
 363          * Set the default lxzd_kernel_version to LX_KERN_2_4.
 364          * This can be changed by a call to setattr() during zone boot.
 365          */
 366         data->lxzd_kernel_version = LX_KERN_2_4;
 367         data->lxzd_max_syscall = LX_NSYSCALLS_2_4;
 368         zone->zone_brand_data = data;
 369 }
 370 
 371 void
 372 lx_free_brand_data(zone_t *zone)
 373 {
 374         kmem_free(zone->zone_brand_data, sizeof (lx_zone_data_t));
 375 }
 376 
 377 /*
 378  * Get the addresses of the user-space system call handler and attach it to
 379  * the proc structure. Returning 0 indicates success; the value returned
 380  * by the system call is the value stored in rval. Returning a non-zero
 381  * value indicates a failure; the value returned is used to set errno, -1
 382  * is returned from the syscall and the contents of rval are ignored. To
 383  * set errno and have the syscall return a value other than -1 we can
 384  * manually set errno and rval and return 0.
 385  */
 386 int
 387 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
 388     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
 389 {
 390         kthread_t *t = curthread;
 391         proc_t *p = ttoproc(t);
 392         lx_proc_data_t *pd;
 393         int linux_call;
 394         struct termios *termios;
 395         uint_t termios_len;
 396         int error;
 397         lx_brand_registration_t reg;
 398 
 399         /*
 400          * There is one operation that is suppored for non-branded
 401          * process.  B_EXEC_BRAND.  This is the equilivant of an
 402          * exec call, but the new process that is created will be
 403          * a branded process.
 404          */
 405         if (cmd == B_EXEC_BRAND) {
 406                 ASSERT(p->p_zone != NULL);
 407                 ASSERT(p->p_zone->zone_brand == &lx_brand);
 408                 return (exec_common(
 409                     (char *)arg1, (const char **)arg2, (const char **)arg3,
 410                     EBA_BRAND));
 411         }
 412 
 413         /* For all other operations this must be a branded process. */
 414         if (p->p_brand == NULL)
 415                 return (set_errno(ENOSYS));
 416 
 417         ASSERT(p->p_brand == &lx_brand);
 418         ASSERT(p->p_brand_data != NULL);
 419 
 420         switch (cmd) {
 421         case B_REGISTER:
 422                 if (p->p_model == DATAMODEL_NATIVE) {
 423                         if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
 424                                 lx_print("Failed to copyin brand registration "
 425                                     "at 0x%p\n", (void *)arg1);
 426                                 return (EFAULT);
 427                         }
 428 #ifdef _LP64
 429                 } else {
 430                         lx_brand_registration32_t reg32;
 431 
 432                         if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
 433                                 lx_print("Failed to copyin brand registration "
 434                                     "at 0x%p\n", (void *)arg1);
 435                                 return (EFAULT);
 436                         }
 437 
 438                         reg.lxbr_version = (uint_t)reg32.lxbr_version;
 439                         reg.lxbr_handler =
 440                             (void *)(uintptr_t)reg32.lxbr_handler;
 441                         reg.lxbr_tracehandler =
 442                             (void *)(uintptr_t)reg32.lxbr_tracehandler;
 443                         reg.lxbr_traceflag =
 444                             (void *)(uintptr_t)reg32.lxbr_traceflag;
 445 #endif
 446                 }
 447 
 448                 if (reg.lxbr_version != LX_VERSION_1) {
 449                         lx_print("Invalid brand library version (%u)\n",
 450                             reg.lxbr_version);
 451                         return (EINVAL);
 452                 }
 453 
 454                 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
 455                     (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
 456                 pd = p->p_brand_data;
 457                 pd->l_handler = (uintptr_t)reg.lxbr_handler;
 458                 pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler;
 459                 pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag;
 460                 *rval = 0;
 461                 return (0);
 462         case B_TTYMODES:
 463                 /* This is necessary for emulating TCGETS ioctls. */
 464                 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
 465                     DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
 466                     &termios_len) != DDI_SUCCESS)
 467                         return (EIO);
 468 
 469                 ASSERT(termios_len == sizeof (*termios));
 470 
 471                 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
 472                         ddi_prop_free(termios);
 473                         return (EFAULT);
 474                 }
 475 
 476                 ddi_prop_free(termios);
 477                 *rval = 0;
 478                 return (0);
 479 
 480         case B_ELFDATA:
 481                 pd = curproc->p_brand_data;
 482                 if (copyout(&pd->l_elf_data, (void *)arg1,
 483                     sizeof (lx_elf_data_t)) != 0) {
 484                         (void) set_errno(EFAULT);
 485                         return (*rval = -1);
 486                 }
 487                 *rval = 0;
 488                 return (0);
 489 
 490         case B_EXEC_NATIVE:
 491                 error = exec_common(
 492                     (char *)arg1, (const char **)arg2, (const char **)arg3,
 493                     EBA_NATIVE);
 494                 if (error) {
 495                         (void) set_errno(error);
 496                         return (*rval = -1);
 497                 }
 498                 return (*rval = 0);
 499 
 500         case B_LPID_TO_SPAIR:
 501                 /*
 502                  * Given a Linux pid as arg1, return the Solaris pid in arg2 and
 503                  * the Solaris LWP in arg3.  We also translate pid 1 (which is
 504                  * hardcoded in many applications) to the zone's init process.
 505                  */
 506                 {
 507                         pid_t s_pid;
 508                         id_t s_tid;
 509 
 510                         if ((pid_t)arg1 == 1) {
 511                                 s_pid = p->p_zone->zone_proc_initpid;
 512                                 /* handle the dead/missing init(1M) case */
 513                                 if (s_pid == -1)
 514                                         s_pid = 1;
 515                                 s_tid = 1;
 516                         } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid,
 517                             &s_tid) < 0)
 518                                 return (ESRCH);
 519 
 520                         if (copyout(&s_pid, (void *)arg2,
 521                             sizeof (s_pid)) != 0 ||
 522                             copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0)
 523                                 return (EFAULT);
 524 
 525                         *rval = 0;
 526                         return (0);
 527                 }
 528 
 529         case B_PTRACE_SYSCALL:
 530                 *rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2,
 531                     (int)arg3);
 532                 return (0);
 533 
 534         case B_SYSENTRY:
 535                 if (lx_systrace_enabled) {
 536                         uint32_t args[6];
 537 
 538                         ASSERT(lx_systrace_entry_ptr != NULL);
 539 
 540                         if (copyin((void *)arg2, args, sizeof (args)) != 0)
 541                                 return (EFAULT);
 542 
 543                         (*lx_systrace_entry_ptr)(arg1, args[0], args[1],
 544                             args[2], args[3], args[4], args[5]);
 545                 }
 546 
 547                 lx_ptrace_fire();
 548 
 549                 pd = p->p_brand_data;
 550 
 551                 /*
 552                  * If neither DTrace not ptrace are interested in tracing
 553                  * this process any more, turn off the trace flag.
 554                  */
 555                 if (!lx_systrace_enabled && !pd->l_ptrace)
 556                         (void) suword32((void *)pd->l_traceflag, 0);
 557 
 558                 *rval = 0;
 559                 return (0);
 560 
 561         case B_SYSRETURN:
 562                 if (lx_systrace_enabled) {
 563                         ASSERT(lx_systrace_return_ptr != NULL);
 564 
 565                         (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0);
 566                 }
 567 
 568                 lx_ptrace_fire();
 569 
 570                 pd = p->p_brand_data;
 571 
 572                 /*
 573                  * If neither DTrace not ptrace are interested in tracing
 574                  * this process any more, turn off the trace flag.
 575                  */
 576                 if (!lx_systrace_enabled && !pd->l_ptrace)
 577                         (void) suword32((void *)pd->l_traceflag, 0);
 578 
 579                 *rval = 0;
 580                 return (0);
 581 
 582         case B_SET_AFFINITY_MASK:
 583         case B_GET_AFFINITY_MASK:
 584                 /*
 585                  * Retrieve or store the CPU affinity mask for the
 586                  * requested linux pid.
 587                  *
 588                  * arg1 is a linux PID (0 means curthread).
 589                  * arg2 is the size of the given mask.
 590                  * arg3 is the address of the affinity mask.
 591                  */
 592                 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
 593 
 594         default:
 595                 linux_call = cmd - B_EMULATE_SYSCALL;
 596                 /*
 597                  * Only checking against highest syscall number for all kernel
 598                  * versions, since check for specific kernel version is done
 599                  * in userland prior to this call, and duplicating logic would
 600                  * be redundant.
 601                  */
 602                 if (linux_call >= 0 && linux_call < LX_NSYSCALLS) {
 603                         *rval = lx_emulate_syscall(linux_call, arg1, arg2,
 604                             arg3, arg4, arg5, arg6);
 605                         return (0);
 606                 }
 607         }
 608 
 609         return (EINVAL);
 610 }
 611 
 612 int
 613 lx_get_zone_kern_version(zone_t *zone)
 614 {
 615         return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
 616 }
 617 
 618 int
 619 lx_get_kern_version()
 620 {
 621         return (lx_get_zone_kern_version(curzone));
 622 }
 623 
 624 void
 625 lx_set_kern_version(zone_t *zone, int vers)
 626 {
 627         lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
 628 
 629         lxzd->lxzd_kernel_version = vers;
 630         if (vers == LX_KERN_2_6)
 631                 lxzd->lxzd_max_syscall = LX_NSYSCALLS_2_6;
 632 }
 633 
 634 /*
 635  * Copy the per-process brand data from a parent proc to a child.
 636  */
 637 void
 638 lx_copy_procdata(proc_t *child, proc_t *parent)
 639 {
 640         lx_proc_data_t *cpd, *ppd;
 641 
 642         ppd = parent->p_brand_data;
 643 
 644         ASSERT(ppd != NULL);
 645 
 646         cpd = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP);
 647         *cpd = *ppd;
 648 
 649         child->p_brand_data = cpd;
 650 }
 651 
 652 /*
 653  * Currently, only 32-bit branded ELF executables are supported.
 654  */
 655 #if defined(_LP64)
 656 #define elfexec                 elf32exec
 657 #define mapexec_brand           mapexec32_brand
 658 #endif /* _LP64 */
 659 
 660 /*
 661  * Exec routine called by elfexec() to load 32-bit Linux binaries.
 662  */
 663 static int
 664 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
 665     struct intpdata *idata, int level, long *execsz, int setid,
 666     caddr_t exec_file, struct cred *cred, int brand_action)
 667 {
 668         int             error;
 669         vnode_t         *nvp;
 670         auxv32_t        phdr_auxv32[3] = {
 671             { AT_SUN_BRAND_LX_PHDR, 0 },
 672             { AT_SUN_BRAND_AUX2, 0 },
 673             { AT_SUN_BRAND_AUX3, 0 }
 674         };
 675         Elf32_Ehdr      ehdr;
 676         Elf32_Addr      uphdr_vaddr;
 677         intptr_t        voffset;
 678         int             interp;
 679         int             i;
 680         struct execenv  env;
 681         struct user     *up = PTOU(ttoproc(curthread));
 682         lx_elf_data_t   *edp =
 683             &((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data;
 684 
 685         ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
 686         ASSERT(ttoproc(curthread)->p_brand_data != NULL);
 687 
 688         /*
 689          * Set the brandname and library name for the new process so that
 690          * elfexec() puts them onto the stack.
 691          */
 692         args->brandname = LX_BRANDNAME;
 693         args->emulator = LX_LIB_PATH;
 694 
 695         /*
 696          * We will exec the brand library, and map in the linux linker and the
 697          * linux executable.
 698          */
 699         if ((error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP,
 700             &nvp))) {
 701                 uprintf("%s: not found.", LX_LIB);
 702                 return (error);
 703         }
 704 
 705         if ((error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid,
 706             exec_file, cred, brand_action))) {
 707                 VN_RELE(nvp);
 708                 return (error);
 709         }
 710         VN_RELE(nvp);
 711 
 712         bzero(&env, sizeof (env));
 713 
 714         if ((error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset,
 715             exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase,
 716             &env.ex_brksize, NULL)))
 717                 return (error);
 718 
 719         /*
 720          * Save off the important properties of the lx executable. The brand
 721          * library will ask us for this data later, when it is ready to set
 722          * things up for the lx executable.
 723          */
 724         edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
 725             voffset + uphdr_vaddr;
 726         edp->ed_entry = voffset + ehdr.e_entry;
 727         edp->ed_phent = ehdr.e_phentsize;
 728         edp->ed_phnum = ehdr.e_phnum;
 729 
 730         if (interp) {
 731                 if (ehdr.e_type == ET_DYN) {
 732                         /*
 733                          * This is a shared object executable, so we need to
 734                          * pick a reasonable place to put the heap. Just don't
 735                          * use the first page.
 736                          */
 737                         env.ex_brkbase = (caddr_t)PAGESIZE;
 738                         env.ex_bssbase = (caddr_t)PAGESIZE;
 739                 }
 740 
 741                 /*
 742                  * If the program needs an interpreter (most do), map it in and
 743                  * store relevant information about it in the aux vector, where
 744                  * the brand library can find it.
 745                  */
 746                 if ((error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP,
 747                     &nvp))) {
 748                         uprintf("%s: not found.", LX_LINKER);
 749                         return (error);
 750                 }
 751                 if ((error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr,
 752                     &voffset, exec_file, &interp, NULL, NULL, NULL, NULL))) {
 753                         VN_RELE(nvp);
 754                         return (error);
 755                 }
 756                 VN_RELE(nvp);
 757 
 758                 /*
 759                  * Now that we know the base address of the brand's linker,
 760                  * place it in the aux vector.
 761                  */
 762                 edp->ed_base = voffset;
 763                 edp->ed_ldentry = voffset + ehdr.e_entry;
 764         } else {
 765                 /*
 766                  * This program has no interpreter. The lx brand library will
 767                  * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
 768                  * so in this case, put the entry point of the main executable
 769                  * there.
 770                  */
 771                 if (ehdr.e_type == ET_EXEC) {
 772                         /*
 773                          * An executable with no interpreter, this must be a
 774                          * statically linked executable, which means we loaded
 775                          * it at the address specified in the elf header, in
 776                          * which case the e_entry field of the elf header is an
 777                          * absolute address.
 778                          */
 779                         edp->ed_ldentry = ehdr.e_entry;
 780                         edp->ed_entry = ehdr.e_entry;
 781                 } else {
 782                         /*
 783                          * A shared object with no interpreter, we use the
 784                          * calculated address from above.
 785                          */
 786                         edp->ed_ldentry = edp->ed_entry;
 787 
 788                         /*
 789                          * In all situations except an ET_DYN elf object with no
 790                          * interpreter, we want to leave the brk and base
 791                          * values set by mapexec_brand alone. Normally when
 792                          * running ET_DYN objects on Solaris (most likely
 793                          * /lib/ld.so.1) the kernel sets brk and base to 0 since
 794                          * it doesn't know where to put the heap, and later the
 795                          * linker will call brk() to initialize the heap in:
 796                          *      usr/src/cmd/sgs/rtld/common/setup.c:setup()
 797                          * after it has determined where to put it.  (This
 798                          * decision is made after the linker loads and inspects
 799                          * elf properties of the target executable being run.)
 800                          *
 801                          * So for ET_DYN Linux executables, we also don't know
 802                          * where the heap should go, so we'll set the brk and
 803                          * base to 0.  But in this case the Solaris linker will
 804                          * not initialize the heap, so when the Linux linker
 805                          * starts running there is no heap allocated.  This
 806                          * seems to be ok on Linux 2.4 based systems because the
 807                          * Linux linker/libc fall back to using mmap() to
 808                          * allocate memory. But on 2.6 systems, running
 809                          * applications by specifying them as command line
 810                          * arguments to the linker results in segfaults for an
                         * as yet undetermined reason (which seems to indicate
                         * that a more permanent fix for heap initialization in
 813                          * these cases may be necessary).
 814                          */
 815                         if (ehdr.e_type == ET_DYN) {
 816                                 env.ex_bssbase = (caddr_t)0;
 817                                 env.ex_brkbase = (caddr_t)0;
 818                                 env.ex_brksize = 0;
 819                         }
 820                 }
 821 
 822         }
 823 
 824         env.ex_vp = vp;
 825         setexecenv(&env);
 826 
 827         /*
 828          * We don't need to copy this stuff out. It is only used by our
 829          * tools to locate the lx linker's debug section. But we should at
 830          * least try to keep /proc's view of the aux vector consistent with
 831          * what's on the process stack.
 832          */
 833         phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
 834 
 835         /*
 836          * Linux 2.6 programs such as ps will print an error message if the
 837          * following aux entry is missing
 838          */
 839         if (lx_get_kern_version() >= LX_KERN_2_6) {
 840                 phdr_auxv32[1].a_type = AT_CLKTCK;
 841                 phdr_auxv32[1].a_un.a_val = hz;
 842         }
 843 
 844         if (copyout(&phdr_auxv32, args->auxp_brand,
 845             sizeof (phdr_auxv32)) == -1)
 846                 return (EFAULT);
 847 
 848         /*
 849          * /proc uses the AT_ENTRY aux vector entry to deduce
 850          * the location of the executable in the address space. The user
 851          * structure contains a copy of the aux vector that needs to have those
 852          * entries patched with the values of the real lx executable (they
 853          * currently contain the values from the lx brand library that was
 854          * elfexec'd, above).
 855          *
 856          * For live processes, AT_BASE is used to locate the linker segment,
 857          * which /proc and friends will later use to find Solaris symbols
 858          * (such as rtld_db_preinit). However, for core files, /proc uses
 859          * AT_ENTRY to find the right segment to label as the executable.
 860          * So we set AT_ENTRY to be the entry point of the linux executable,
 861          * but leave AT_BASE to be the address of the Solaris linker.
 862          */
 863         for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
 864                 if (up->u_auxv[i].a_type == AT_ENTRY)
 865                         up->u_auxv[i].a_un.a_val = edp->ed_entry;
 866                 if (up->u_auxv[i].a_type == AT_SUN_BRAND_LX_PHDR)
 867                         up->u_auxv[i].a_un.a_val = edp->ed_phdr;
 868         }
 869 
 870         return (0);
 871 }
 872 
 873 int
 874 _init(void)
 875 {
 876         int err = 0;
 877 
 878         /* pid/tid conversion hash tables */
 879         lx_pid_init();
 880 
 881         /* for lx_futex() */
 882         lx_futex_init();
 883 
 884         err = mod_install(&modlinkage);
 885         if (err != 0) {
 886                 cmn_err(CE_WARN, "Couldn't install lx brand module");
 887 
 888                 /*
 889                  * This looks drastic, but it should never happen.  These
 890                  * two data structures should be completely free-able until
 891                  * they are used by Linux processes.  Since the brand
 892                  * wasn't loaded there should be no Linux processes, and
 893                  * thus no way for these data structures to be modified.
 894                  */
 895                 lx_pid_fini();
 896                 if (lx_futex_fini())
 897                         panic("lx brand module cannot be loaded or unloaded.");
 898         }
 899         return (err);
 900 }
 901 
 902 int
 903 _info(struct modinfo *modinfop)
 904 {
 905         return (mod_info(&modlinkage, modinfop));
 906 }
 907 
 908 int
 909 _fini(void)
 910 {
 911         int err;
 912         int futex_done = 0;
 913 
 914         /*
 915          * If there are any zones using this brand, we can't allow it to be
 916          * unloaded.
 917          */
 918         if (brand_zone_count(&lx_brand))
 919                 return (EBUSY);
 920 
 921         lx_pid_fini();
 922 
 923         if ((err = lx_futex_fini()) != 0)
 924                 goto done;
 925         futex_done = 1;
 926 
 927         err = mod_remove(&modlinkage);
 928 
 929 done:
 930         if (err) {
 931                 /*
 932                  * If we can't unload the module, then we have to get it
 933                  * back into a sane state.
 934                  */
 935                 lx_pid_init();
 936 
 937                 if (futex_done)
 938                         lx_futex_init();
 939 
 940         }
 941 
 942         return (err);
 943 }