1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2019 Joyent, Inc.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/vmparam.h>
  29 #include <sys/types.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/systm.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/signal.h>
  34 #include <sys/stack.h>
  35 #include <sys/cred.h>
  36 #include <sys/user.h>
  37 #include <sys/debug.h>
  38 #include <sys/errno.h>
  39 #include <sys/proc.h>
  40 #include <sys/var.h>
  41 #include <sys/inline.h>
  42 #include <sys/syscall.h>
  43 #include <sys/ucontext.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/siginfo.h>
  46 #include <sys/trap.h>
  47 #include <sys/machtrap.h>
  48 #include <sys/sysinfo.h>
  49 #include <sys/procfs.h>
  50 #include <sys/prsystm.h>
  51 #include <sys/fpu/fpusystm.h>
  52 #include <sys/modctl.h>
  53 #include <sys/aio_impl.h>
  54 #include <c2/audit.h>
  55 #include <sys/tnf.h>
  56 #include <sys/tnf_probe.h>
  57 #include <sys/machpcb.h>
  58 #include <sys/privregs.h>
  59 #include <sys/copyops.h>
  60 #include <sys/timer.h>
  61 #include <sys/priv.h>
  62 #include <sys/msacct.h>
  63 
/*
 * Run-time switch for the printf-based system call tracing in this file.
 * The tracing code itself is only compiled in when SYSCALLTRACE is defined.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
/* Held around each group of trace printf()s so the output stays together. */
static kmutex_t systrace_lock;          /* syscall tracing lock */
#endif /* SYSCALLTRACE */

/*
 * Forward declaration; the definition is outside this part of the file.
 * The caller releases the returned lock with rw_exit().
 */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
  70 
  71 #ifdef _SYSCALL32_IMPL
  72 static struct sysent *
  73 lwp_getsysent(klwp_t *lwp)
  74 {
  75         if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
  76                 return (sysent);
  77         return (sysent32);
  78 }
  79 #define LWP_GETSYSENT(lwp)      (lwp_getsysent(lwp))
  80 #else
  81 #define LWP_GETSYSENT(lwp)      (sysent)
  82 #endif
  83 
  84 /*
  85  * Called to restore the lwp's register window just before
  86  * returning to user level (only if the registers have been
  87  * fetched or modified through /proc).
  88  */
  89 /*ARGSUSED1*/
  90 void
  91 xregrestore(klwp_t *lwp, int shared)
  92 {
  93         /*
  94          * If locals+ins were modified by /proc copy them out.
  95          * Also copy to the shared window, if necessary.
  96          */
  97         if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
  98                 struct machpcb *mpcb = lwptompcb(lwp);
  99                 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
 100 
 101                 size_t rwinsize;
 102                 caddr_t rwp;
 103                 int is64;
 104 
 105                 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
 106                         rwinsize = sizeof (struct rwindow);
 107                         rwp = sp + STACK_BIAS;
 108                         is64 = 1;
 109                 } else {
 110                         rwinsize = sizeof (struct rwindow32);
 111                         sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
 112                         rwp = sp;
 113                         is64 = 0;
 114                 }
 115 
 116                 if (is64)
 117                         (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
 118                             rwp, rwinsize);
 119                 else {
 120                         struct rwindow32 rwindow32;
 121                         int watched;
 122 
 123                         watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
 124                         rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
 125                         (void) copyout(&rwindow32, rwp, rwinsize);
 126                         if (watched)
 127                                 watch_enable_addr(rwp, rwinsize, S_WRITE);
 128                 }
 129 
 130                 /* also copy to the user return window */
 131                 mpcb->mpcb_rsp[0] = sp;
 132                 mpcb->mpcb_rsp[1] = NULL;
 133                 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
 134                     sizeof (lwp->lwp_pcb.pcb_xregs));
 135         }
 136         lwp->lwp_pcb.pcb_xregstat = XREGNONE;
 137 }
 138 
 139 
 140 /*
 141  * Get the arguments to the current system call.
 142  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 143  *      If the user is going to change the out registers and might want to
 144  *      get the args (for /proc tracing), it must copy the args elsewhere
 145  *      via save_syscall_args().
 146  */
 147 uint_t
 148 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
 149 {
 150         kthread_t       *t = lwptot(lwp);
 151         uint_t  code = t->t_sysnum;
 152         long    mask;
 153         long    *ap;
 154         int     nargs;
 155 
 156         if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
 157                 mask = (uint32_t)0xffffffffU;
 158         else
 159                 mask = 0xffffffffffffffff;
 160 
 161         if (code != 0 && code < NSYSCALL) {
 162 
 163                 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
 164 
 165                 ASSERT(nargs <= MAXSYSARGS);
 166 
 167                 *nargsp = nargs;
 168                 ap = lwp->lwp_ap;
 169                 while (nargs-- > 0)
 170                         *argp++ = *ap++ & mask;
 171         } else {
 172                 *nargsp = 0;
 173         }
 174         return (code);
 175 }
 176 
 177 #ifdef _SYSCALL32_IMPL
 178 /*
 179  * Get the arguments to the current 32-bit system call.
 180  */
 181 uint_t
 182 get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
 183 {
 184         long args[MAXSYSARGS];
 185         uint_t i, code;
 186 
 187         code = get_syscall_args(lwp, args, nargsp);
 188         for (i = 0; i != *nargsp; i++)
 189                 *argp++ = (int)args[i];
 190         return (code);
 191 }
 192 #endif
 193 
 194 /*
 195  *      Save the system call arguments in a safe place.
 196  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 197  *      If the user is going to change the out registers, g1, or the stack,
 198  *      and might want to get the args (for /proc tracing), it must copy
 199  *      the args elsewhere via save_syscall_args().
 200  *
 201  *      This may be called from stop() even when we're not in a system call.
 202  *      Since there's no easy way to tell, this must be safe (not panic).
 203  *      If the copyins get data faults, return non-zero.
 204  */
 205 int
 206 save_syscall_args()
 207 {
 208         kthread_t       *t = curthread;
 209         klwp_t          *lwp = ttolwp(t);
 210         struct regs     *rp = lwptoregs(lwp);
 211         uint_t          code = t->t_sysnum;
 212         uint_t          nargs;
 213         int             i;
 214         caddr_t         ua;
 215         model_t         datamodel;
 216 
 217         if (lwp->lwp_argsaved || code == 0)
 218                 return (0);             /* args already saved or not needed */
 219 
 220         if (code >= NSYSCALL) {
 221                 nargs = 0;              /* illegal syscall */
 222         } else {
 223                 struct sysent *se = LWP_GETSYSENT(lwp);
 224                 struct sysent *callp = se + code;
 225 
 226                 nargs = callp->sy_narg;
 227                 if (LOADABLE_SYSCALL(callp) && nargs == 0) {
 228                         krwlock_t       *module_lock;
 229 
 230                         /*
 231                          * Find out how many arguments the system
 232                          * call uses.
 233                          *
 234                          * We have the property that loaded syscalls
 235                          * never change the number of arguments they
 236                          * use after they've been loaded once.  This
 237                          * allows us to stop for /proc tracing without
 238                          * holding the module lock.
 239                          * /proc is assured that sy_narg is valid.
 240                          */
 241                         module_lock = lock_syscall(se, code);
 242                         nargs = callp->sy_narg;
 243                         rw_exit(module_lock);
 244                 }
 245         }
 246 
 247         /*
 248          * Fetch the system call arguments.
 249          */
 250         if (nargs == 0)
 251                 goto out;
 252 
 253 
 254         ASSERT(nargs <= MAXSYSARGS);
 255 
 256         if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
 257 
 258                 if (rp->r_g1 == 0) { /* indirect syscall */
 259 
 260                         lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
 261                         lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
 262                         lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
 263                         lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
 264                         lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
 265                         if (nargs > 5) {
 266                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 267                                     (rp->r_sp + MINFRAME32);
 268                                 for (i = 5; i < nargs; i++) {
 269                                         uint32_t a;
 270                                         if (fuword32(ua, &a) != 0)
 271                                                 return (-1);
 272                                         lwp->lwp_arg[i] = a;
 273                                         ua += sizeof (a);
 274                                 }
 275                         }
 276                 } else {
 277                         lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
 278                         lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
 279                         lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
 280                         lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
 281                         lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
 282                         lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
 283                         if (nargs > 6) {
 284                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 285                                     (rp->r_sp + MINFRAME32);
 286                                 for (i = 6; i < nargs; i++) {
 287                                         uint32_t a;
 288                                         if (fuword32(ua, &a) != 0)
 289                                                 return (-1);
 290                                         lwp->lwp_arg[i] = a;
 291                                         ua += sizeof (a);
 292                                 }
 293                         }
 294                 }
 295         } else {
 296                 ASSERT(datamodel == DATAMODEL_LP64);
 297                 lwp->lwp_arg[0] = rp->r_o0;
 298                 lwp->lwp_arg[1] = rp->r_o1;
 299                 lwp->lwp_arg[2] = rp->r_o2;
 300                 lwp->lwp_arg[3] = rp->r_o3;
 301                 lwp->lwp_arg[4] = rp->r_o4;
 302                 lwp->lwp_arg[5] = rp->r_o5;
 303                 if (nargs > 6) {
 304                         ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
 305                         for (i = 6; i < nargs; i++) {
 306                                 unsigned long a;
 307                                 if (fulword(ua, &a) != 0)
 308                                         return (-1);
 309                                 lwp->lwp_arg[i] = a;
 310                                 ua += sizeof (a);
 311                         }
 312                 }
 313         }
 314 
 315 out:
 316         lwp->lwp_ap = lwp->lwp_arg;
 317         lwp->lwp_argsaved = 1;
 318         t->t_post_sys = 1;   /* so lwp_ap will be reset */
 319         return (0);
 320 }
 321 
 322 void
 323 reset_syscall_args(void)
 324 {
 325         klwp_t *lwp = ttolwp(curthread);
 326 
 327         lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
 328         lwp->lwp_argsaved = 0;
 329 }
 330 
 331 /*
 332  * nonexistent system call-- signal lwp (may want to handle it)
 333  * flag error if lwp won't see signal immediately
 334  * This works for old or new calling sequence.
 335  */
 336 int64_t
 337 nosys()
 338 {
 339         tsignal(curthread, SIGSYS);
 340         return ((int64_t)set_errno(ENOSYS));
 341 }
 342 
 343 /*
 344  * Perform pre-system-call processing, including stopping for tracing,
 345  * auditing, microstate-accounting, etc.
 346  *
 347  * This routine is called only if the t_pre_sys flag is set.  Any condition
 348  * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 349  * condition is persistent, this routine will repost t_pre_sys.
 350  */
 351 int
 352 pre_syscall(int arg0)
 353 {
 354         unsigned int code;
 355         kthread_t *t = curthread;
 356         proc_t *p = ttoproc(t);
 357         klwp_t *lwp = ttolwp(t);
 358         struct regs *rp = lwptoregs(lwp);
 359         int     repost;
 360 
 361         t->t_pre_sys = repost = 0;   /* clear pre-syscall processing flag */
 362 
 363         ASSERT(t->t_schedflag & TS_DONT_SWAP);
 364 
 365         syscall_mstate(LMS_USER, LMS_SYSTEM);
 366 
 367         /*
 368          * The syscall arguments in the out registers should be pointed to
 369          * by lwp_ap.  If the args need to be copied so that the outs can
 370          * be changed without losing the ability to get the args for /proc,
 371          * they can be saved by save_syscall_args(), and lwp_ap will be
 372          * restored by post_syscall().
 373          */
 374         ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
 375 
 376         /*
 377          * Make sure the thread is holding the latest credentials for the
 378          * process.  The credentials in the process right now apply to this
 379          * thread for the entire system call.
 380          */
 381         if (t->t_cred != p->p_cred) {
 382                 cred_t *oldcred = t->t_cred;
 383                 /*
 384                  * DTrace accesses t_cred in probe context.  t_cred must
 385                  * always be either NULL, or point to a valid, allocated cred
 386                  * structure.
 387                  */
 388                 t->t_cred = crgetcred();
 389                 crfree(oldcred);
 390         }
 391 
 392         /*
 393          * Undo special arrangements to single-step the lwp
 394          * so that a debugger will see valid register contents.
 395          * Also so that the pc is valid for syncfpu().
 396          * Also so that a syscall like exec() can be stepped.
 397          */
 398         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 399                 (void) prundostep();
 400                 repost = 1;
 401         }
 402 
 403         /*
 404          * Check for indirect system call in case we stop for tracing.
 405          * Don't allow multiple indirection.
 406          */
 407         code = t->t_sysnum;
 408         if (code == 0 && arg0 != 0) {           /* indirect syscall */
 409                 code = arg0;
 410                 t->t_sysnum = arg0;
 411         }
 412 
 413         /*
 414          * From the proc(4) manual page:
 415          * When entry to a system call is being traced, the traced process
 416          * stops after having begun the call to the system but before the
 417          * system call arguments have been fetched from the process.
 418          * If proc changes the args we must refetch them after starting.
 419          */
 420         if (PTOU(p)->u_systrap) {
 421                 if (prismember(&PTOU(p)->u_entrymask, code)) {
 422                         /*
 423                          * Recheck stop condition, now that lock is held.
 424                          */
 425                         mutex_enter(&p->p_lock);
 426                         if (PTOU(p)->u_systrap &&
 427                             prismember(&PTOU(p)->u_entrymask, code)) {
 428                                 stop(PR_SYSENTRY, code);
 429                                 /*
 430                                  * Must refetch args since they were
 431                                  * possibly modified by /proc.  Indicate
 432                                  * that the valid copy is in the
 433                                  * registers.
 434                                  */
 435                                 lwp->lwp_argsaved = 0;
 436                                 lwp->lwp_ap = (long *)&rp->r_o0;
 437                         }
 438                         mutex_exit(&p->p_lock);
 439                 }
 440                 repost = 1;
 441         }
 442 
 443         if (lwp->lwp_sysabort) {
 444                 /*
 445                  * lwp_sysabort may have been set via /proc while the process
 446                  * was stopped on PR_SYSENTRY.  If so, abort the system call.
 447                  * Override any error from the copyin() of the arguments.
 448                  */
 449                 lwp->lwp_sysabort = 0;
 450                 (void) set_errno(EINTR); /* sets post-sys processing */
 451                 t->t_pre_sys = 1;    /* repost anyway */
 452                 return (1);             /* don't do system call, return EINTR */
 453         }
 454 
 455         /* begin auditing for this syscall */
 456         if (audit_active == C2AUDIT_LOADED) {
 457                 uint32_t auditing = au_zone_getstate(NULL);
 458 
 459                 if (auditing & AU_AUDIT_MASK) {
 460                         int error;
 461                         if (error = audit_start(T_SYSCALL, code, auditing, \
 462                             0, lwp)) {
 463                                 t->t_pre_sys = 1;    /* repost anyway */
 464                                 lwp->lwp_error = 0;  /* for old drivers */
 465                                 return (error);
 466                         }
 467                         repost = 1;
 468                 }
 469         }
 470 
 471 #ifndef NPROBE
 472         /* Kernel probe */
 473         if (tnf_tracing_active) {
 474                 TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
 475                         tnf_sysnum,     sysnum,         t->t_sysnum);
 476                 t->t_post_sys = 1;   /* make sure post_syscall runs */
 477                 repost = 1;
 478         }
 479 #endif /* NPROBE */
 480 
 481 #ifdef SYSCALLTRACE
 482         if (syscalltrace) {
 483                 int i;
 484                 long *ap;
 485                 char *cp;
 486                 char *sysname;
 487                 struct sysent *callp;
 488 
 489                 if (code >= NSYSCALL)
 490                         callp = &nosys_ent; /* nosys has no args */
 491                 else
 492                         callp = LWP_GETSYSENT(lwp) + code;
 493                 (void) save_syscall_args();
 494                 mutex_enter(&systrace_lock);
 495                 printf("%d: ", p->p_pid);
 496                 if (code >= NSYSCALL)
 497                         printf("0x%x", code);
 498                 else {
 499                         sysname = mod_getsysname(code);
 500                         printf("%s[0x%x]", sysname == NULL ? "NULL" :
 501                             sysname, code);
 502                 }
 503                 cp = "(";
 504                 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
 505                         printf("%s%lx", cp, *ap);
 506                         cp = ", ";
 507                 }
 508                 if (i)
 509                         printf(")");
 510                 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
 511                 mutex_exit(&systrace_lock);
 512         }
 513 #endif /* SYSCALLTRACE */
 514 
 515         /*
 516          * If there was a continuing reason for pre-syscall processing,
 517          * set the t_pre_sys flag for the next system call.
 518          */
 519         if (repost)
 520                 t->t_pre_sys = 1;
 521         lwp->lwp_error = 0;  /* for old drivers */
 522         lwp->lwp_badpriv = PRIV_NONE;        /* for privilege tracing */
 523         return (0);
 524 }
 525 
 526 /*
 527  * Post-syscall processing.  Perform abnormal system call completion
 528  * actions such as /proc tracing, profiling, signals, preemption, etc.
 529  *
 530  * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 532  * If the condition is persistent, this routine will repost t_post_sys.
 533  */
 534 void
 535 post_syscall(long rval1, long rval2)
 536 {
 537         kthread_t       *t = curthread;
 538         proc_t  *p = curproc;
 539         klwp_t  *lwp = ttolwp(t);
 540         struct regs *rp = lwptoregs(lwp);
 541         uint_t  error;
 542         int     code = t->t_sysnum;
 543         int     repost = 0;
 544         int     proc_stop = 0;          /* non-zero if stopping for /proc */
 545         int     sigprof = 0;            /* non-zero if sending SIGPROF */
 546 
 547         t->t_post_sys = 0;
 548 
 549         error = lwp->lwp_errno;
 550 
 551         /*
 552          * Code can be zero if this is a new LWP returning after a forkall(),
 553          * other than the one which matches the one in the parent which called
 554          * forkall().  In these LWPs, skip most of post-syscall activity.
 555          */
 556         if (code == 0)
 557                 goto sig_check;
 558 
 559         /* put out audit record for this syscall */
 560         if (AU_AUDITING()) {
 561                 rval_t  rval;   /* fix audit_finish() someday */
 562 
 563                 /* XX64 -- truncation of 64-bit return values? */
 564                 rval.r_val1 = (int)rval1;
 565                 rval.r_val2 = (int)rval2;
 566                 audit_finish(T_SYSCALL, code, error, &rval);
 567                 repost = 1;
 568         }
 569 
 570         if (curthread->t_pdmsg != NULL) {
 571                 char *m = curthread->t_pdmsg;
 572 
 573                 uprintf("%s", m);
 574                 kmem_free(m, strlen(m) + 1);
 575                 curthread->t_pdmsg = NULL;
 576         }
 577 
 578         /*
 579          * If we're going to stop for /proc tracing, set the flag and
 580          * save the arguments so that the return values don't smash them.
 581          */
 582         if (PTOU(p)->u_systrap) {
 583                 if (prismember(&PTOU(p)->u_exitmask, code)) {
 584                         proc_stop = 1;
 585                         (void) save_syscall_args();
 586                 }
 587                 repost = 1;
 588         }
 589 
 590         /*
 591          * Similarly check to see if SIGPROF might be sent.
 592          */
 593         if (curthread->t_rprof != NULL &&
 594             curthread->t_rprof->rp_anystate != 0) {
 595                 (void) save_syscall_args();
 596                 sigprof = 1;
 597         }
 598 
 599         if (lwp->lwp_eosys == NORMALRETURN) {
 600                 if (error == 0) {
 601 #ifdef SYSCALLTRACE
 602                         if (syscalltrace) {
 603                                 mutex_enter(&systrace_lock);
 604                                 printf(
 605                                     "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
 606                                     p->p_pid, rval1, rval2, curthread);
 607                                 mutex_exit(&systrace_lock);
 608                         }
 609 #endif /* SYSCALLTRACE */
 610                         rp->r_tstate &= ~TSTATE_IC;
 611                         rp->r_o0 = rval1;
 612                         rp->r_o1 = rval2;
 613                 } else {
 614                         int sig;
 615 
 616 #ifdef SYSCALLTRACE
 617                         if (syscalltrace) {
 618                                 mutex_enter(&systrace_lock);
 619                                 printf("%d: error=%d, id 0x%p\n",
 620                                     p->p_pid, error, curthread);
 621                                 mutex_exit(&systrace_lock);
 622                         }
 623 #endif /* SYSCALLTRACE */
 624                         if (error == EINTR && t->t_activefd.a_stale)
 625                                 error = EBADF;
 626                         if (error == EINTR &&
 627                             (sig = lwp->lwp_cursig) != 0 &&
 628                             sigismember(&PTOU(p)->u_sigrestart, sig) &&
 629                             PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
 630                             PTOU(p)->u_signal[sig - 1] != SIG_IGN)
 631                                 error = ERESTART;
 632                         rp->r_o0 = error;
 633                         rp->r_tstate |= TSTATE_IC;
 634                 }
 635                 /*
 636                  * The default action is to redo the trap instruction.
 637                  * We increment the pc and npc past it for NORMALRETURN.
 638                  * JUSTRETURN has set up a new pc and npc already.
 639                  * If we are a cloned thread of forkall(), don't
 640                  * adjust here because we have already inherited
 641                  * the adjusted values from our clone.
 642                  */
 643                 if (!(t->t_flag & T_FORKALL)) {
 644                         rp->r_pc = rp->r_npc;
 645                         rp->r_npc += 4;
 646                 }
 647         }
 648 
 649         /*
 650          * From the proc(4) manual page:
 651          * When exit from a system call is being traced, the traced process
 652          * stops on completion of the system call just prior to checking for
 653          * signals and returning to user level.  At this point all return
 654          * values have been stored into the traced process's saved registers.
 655          */
 656         if (proc_stop) {
 657                 mutex_enter(&p->p_lock);
 658                 if (PTOU(p)->u_systrap &&
 659                     prismember(&PTOU(p)->u_exitmask, code))
 660                         stop(PR_SYSEXIT, code);
 661                 mutex_exit(&p->p_lock);
 662         }
 663 
 664         /*
 665          * If we are the parent returning from a successful
 666          * vfork, wait for the child to exec or exit.
 667          * This code must be here and not in the bowels of the system
 668          * so that /proc can intercept exit from vfork in a timely way.
 669          */
 670         if (t->t_flag & T_VFPARENT) {
 671                 ASSERT(code == SYS_vfork || code == SYS_forksys);
 672                 ASSERT(rp->r_o1 == 0 && error == 0);
 673                 vfwait((pid_t)rval1);
 674                 t->t_flag &= ~T_VFPARENT;
 675         }
 676 
 677         /*
 678          * If profiling is active, bill the current PC in user-land
 679          * and keep reposting until profiling is disabled.
 680          */
 681         if (p->p_prof.pr_scale) {
 682                 if (lwp->lwp_oweupc)
 683                         profil_tick(rp->r_pc);
 684                 repost = 1;
 685         }
 686 
 687 sig_check:
 688         /*
 689          * Reset flag for next time.
 690          * We must do this after stopping on PR_SYSEXIT
 691          * because /proc uses the information in lwp_eosys.
 692          */
 693         lwp->lwp_eosys = NORMALRETURN;
 694         clear_stale_fd();
 695         t->t_flag &= ~T_FORKALL;
 696 
 697         if (t->t_astflag | t->t_sig_check) {
 698                 /*
 699                  * Turn off the AST flag before checking all the conditions that
 700                  * may have caused an AST.  This flag is on whenever a signal or
 701                  * unusual condition should be handled after the next trap or
 702                  * syscall.
 703                  */
 704                 astoff(t);
 705                 t->t_sig_check = 0;
 706 
 707                 /*
 708                  * The following check is legal for the following reasons:
 709                  *      1) The thread we are checking, is ourselves, so there is
 710                  *         no way the proc can go away.
 711                  *      2) The only time we need to be protected by the
 712                  *         lock is if the binding is changed.
 713                  *
 714                  *      Note we will still take the lock and check the binding
 715                  *      if the condition was true without the lock held.  This
 716                  *      prevents lock contention among threads owned by the
 717                  *      same proc.
 718                  */
 719 
 720                 if (curthread->t_proc_flag & TP_CHANGEBIND) {
 721                         mutex_enter(&p->p_lock);
 722                         if (curthread->t_proc_flag & TP_CHANGEBIND) {
 723                                 timer_lwpbind();
 724                                 curthread->t_proc_flag &= ~TP_CHANGEBIND;
 725                         }
 726                         mutex_exit(&p->p_lock);
 727                 }
 728 
 729                 /*
 730                  * for kaio requests on the special kaio poll queue,
 731                  * copyout their results to user memory.
 732                  */
 733                 if (p->p_aio)
 734                         aio_cleanup(0);
 735 
 736                 /*
 737                  * If this LWP was asked to hold, call holdlwp(), which will
 738                  * stop.  holdlwps() sets this up and calls pokelwps() which
 739                  * sets the AST flag.
 740                  *
 741                  * Also check TP_EXITLWP, since this is used by fresh new LWPs
 742                  * through lwp_rtt().  That flag is set if the lwp_create(2)
 743                  * syscall failed after creating the LWP.
 744                  */
 745                 if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
 746                         holdlwp();
 747 
 748                 /*
 749                  * All code that sets signals and makes ISSIG_PENDING
 750                  * evaluate true must set t_sig_check afterwards.
 751                  */
 752                 if (ISSIG_PENDING(t, lwp, p)) {
 753                         if (issig(FORREAL))
 754                                 psig();
 755                         t->t_sig_check = 1;  /* recheck next time */
 756                 }
 757 
 758                 if (sigprof) {
 759                         int nargs = (code > 0 && code < NSYSCALL)?
 760                             LWP_GETSYSENT(lwp)[code].sy_narg : 0;
 761                         realsigprof(code, nargs, error);
 762                         t->t_sig_check = 1;  /* recheck next time */
 763                 }
 764 
 765                 /*
 766                  * If a performance counter overflow interrupt was
 767                  * delivered *during* the syscall, then re-enable the
 768                  * AST so that we take a trip through trap() to cause
 769                  * the SIGEMT to be delivered.
 770                  */
 771                 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
 772                         aston(t);
 773 
 774                 /*
 775                  * If an asynchronous hardware error is pending, turn AST flag
 776                  * back on.  AST will be checked again before we return to user
 777                  * mode and we'll come back through trap() to handle the error.
 778                  */
 779                 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
 780                         aston(t);
 781         }
 782 
 783         /*
 784          * Restore register window if a debugger modified it.
 785          * Set up to perform a single-step if a debugger requested it.
 786          */
 787         if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
 788                 xregrestore(lwp, 1);
 789 
 790         lwp->lwp_errno = 0;          /* clear error for next time */
 791 
 792 #ifndef NPROBE
 793         /* Kernel probe */
 794         if (tnf_tracing_active) {
 795                 TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
 796                     tnf_long,   rval1,          rval1,
 797                     tnf_long,   rval2,          rval2,
 798                     tnf_long,   errno,          (long)error);
 799                 repost = 1;
 800         }
 801 #endif /* NPROBE */
 802 
 803         /*
 804          * Set state to LWP_USER here so preempt won't give us a kernel
 805          * priority if it occurs after this point.  Call CL_TRAPRET() to
 806          * restore the user-level priority.
 807          *
 808          * It is important that no locks (other than spinlocks) be entered
 809          * after this point before returning to user mode (unless lwp_state
 810          * is set back to LWP_SYS).
 811          *
 812          * Sampled times past this point are charged to the user.
 813          */
 814         lwp->lwp_state = LWP_USER;
 815 
 816         if (t->t_trapret) {
 817                 t->t_trapret = 0;
 818                 thread_lock(t);
 819                 CL_TRAPRET(t);
 820                 thread_unlock(t);
 821         }
 822         if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
 823                 preempt();
 824         prunstop();
 825 
 826         /*
 827          * t_post_sys will be set if pcb_step is active.
 828          */
 829         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 830                 prdostep();
 831                 repost = 1;
 832         }
 833 
 834         t->t_sysnum = 0;     /* no longer in a system call */
 835 
 836         /*
 837          * In case the args were copied to the lwp, reset the
 838          * pointer so the next syscall will have the right lwp_ap pointer.
 839          */
 840         lwp->lwp_ap = (long *)&rp->r_o0;
 841         lwp->lwp_argsaved = 0;
 842 
 843         /*
 844          * If there was a continuing reason for post-syscall processing,
 845          * set the t_post_sys flag for the next system call.
 846          */
 847         if (repost)
 848                 t->t_post_sys = 1;
 849 
 850         /*
 851          * If there is a ustack registered for this lwp, and the stack rlimit
 852          * has been altered, read in the ustack. If the saved stack rlimit
 853          * matches the bounds of the ustack, update the ustack to reflect
 854          * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
 855          * stack checking by setting the size to 0.
 856          */
 857         if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
 858                 rlim64_t new_size;
 859                 model_t model;
 860                 caddr_t top;
 861                 struct rlimit64 rl;
 862 
 863                 mutex_enter(&p->p_lock);
 864                 new_size = p->p_stk_ctl;
 865                 model = p->p_model;
 866                 top = p->p_usrstack;
 867                 (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
 868                 mutex_exit(&p->p_lock);
 869 
 870                 if (rl.rlim_cur == RLIM64_INFINITY)
 871                         new_size = 0;
 872 
 873                 if (model == DATAMODEL_NATIVE) {
 874                         stack_t stk;
 875 
 876                         if (copyin((stack_t *)lwp->lwp_ustack, &stk,
 877                             sizeof (stack_t)) == 0 &&
 878                             (stk.ss_size == lwp->lwp_old_stk_ctl ||
 879                             stk.ss_size == 0) &&
 880                             stk.ss_sp == top - stk.ss_size) {
 881                                 stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
 882                                     stk.ss_size - new_size);
 883                                 stk.ss_size = new_size;
 884 
 885                                 (void) copyout(&stk,
 886                                     (stack_t *)lwp->lwp_ustack,
 887                                     sizeof (stack_t));
 888                         }
 889                 } else {
 890                         stack32_t stk32;
 891 
 892                         if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
 893                             sizeof (stack32_t)) == 0 &&
 894                             (stk32.ss_size == lwp->lwp_old_stk_ctl ||
 895                             stk32.ss_size == 0) &&
 896                             stk32.ss_sp ==
 897                             (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
 898                                 stk32.ss_sp += stk32.ss_size - new_size;
 899                                 stk32.ss_size = new_size;
 900 
 901                                 (void) copyout(&stk32,
 902                                     (stack32_t *)lwp->lwp_ustack,
 903                                     sizeof (stack32_t));
 904                         }
 905                 }
 906 
 907                 lwp->lwp_old_stk_ctl = 0;
 908         }
 909 
 910         syscall_mstate(LMS_SYSTEM, LMS_USER);
 911 }
 912 
 913 /*
 914  * Call a system call which takes a pointer to the user args struct and
 915  * a pointer to the return values.  This is a bit slower than the standard
 916  * C arg-passing method in some cases.
 917  */
 918 int64_t
 919 syscall_ap()
 920 {
 921         uint_t  error;
 922         struct sysent *callp;
 923         rval_t  rval;
 924         klwp_t  *lwp = ttolwp(curthread);
 925         struct regs *rp = lwptoregs(lwp);
 926 
 927         callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
 928 
 929         /*
 930          * If the arguments don't fit in registers %o0 - o5, make sure they
 931          * have been copied to the lwp_arg array.
 932          */
 933         if (callp->sy_narg > 6 && save_syscall_args())
 934                 return ((int64_t)set_errno(EFAULT));
 935 
 936         rval.r_val1 = 0;
 937         rval.r_val2 = (int)rp->r_o1;
 938         lwp->lwp_error = 0;  /* for old drivers */
 939         error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
 940         if (error)
 941                 return ((int64_t)set_errno(error));
 942         return (rval.r_vals);
 943 }
 944 
 945 /*
 946  * Load system call module.
 947  *      Returns with pointer to held read lock for module.
 948  */
 949 static krwlock_t *
 950 lock_syscall(struct sysent *table, uint_t code)
 951 {
 952         krwlock_t       *module_lock;
 953         struct modctl   *modp;
 954         int             id;
 955         struct sysent   *callp;
 956 
 957         module_lock = table[code].sy_lock;
 958         callp = &table[code];
 959 
 960         /*
 961          * Optimization to only call modload if we don't have a loaded
 962          * syscall.
 963          */
 964         rw_enter(module_lock, RW_READER);
 965         if (LOADED_SYSCALL(callp))
 966                 return (module_lock);
 967         rw_exit(module_lock);
 968 
 969         for (;;) {
 970                 if ((id = modload("sys", syscallnames[code])) == -1)
 971                         break;
 972 
 973                 /*
 974                  * If we loaded successfully at least once, the modctl
 975                  * will still be valid, so we try to grab it by filename.
 976                  * If this call fails, it's because the mod_filename
 977                  * was changed after the call to modload() (mod_hold_by_name()
 978                  * is the likely culprit).  We can safely just take
 979                  * another lap if this is the case;  the modload() will
 980                  * change the mod_filename back to one by which we can
 981                  * find the modctl.
 982                  */
 983                 modp = mod_find_by_filename("sys", syscallnames[code]);
 984 
 985                 if (modp == NULL)
 986                         continue;
 987 
 988                 mutex_enter(&mod_lock);
 989 
 990                 if (!modp->mod_installed) {
 991                         mutex_exit(&mod_lock);
 992                         continue;
 993                 }
 994                 break;
 995         }
 996 
 997         rw_enter(module_lock, RW_READER);
 998 
 999         if (id != -1)
1000                 mutex_exit(&mod_lock);
1001 
1002         return (module_lock);
1003 }
1004 
1005 /*
1006  * Loadable syscall support.
1007  *      If needed, load the module, then reserve it by holding a read
1008  *      lock for the duration of the call.
1009  *      Later, if the syscall is not unloadable, it could patch the vector.
1010  */
1011 /*ARGSUSED*/
1012 int64_t
1013 loadable_syscall(
1014     long a0, long a1, long a2, long a3,
1015     long a4, long a5, long a6, long a7)
1016 {
1017         int64_t         rval;
1018         struct sysent   *callp;
1019         struct sysent   *se = LWP_GETSYSENT(ttolwp(curthread));
1020         krwlock_t       *module_lock;
1021         int             code;
1022 
1023         code = curthread->t_sysnum;
1024         callp = se + code;
1025 
1026         /*
1027          * Try to autoload the system call if necessary.
1028          */
1029         module_lock = lock_syscall(se, code);
1030 
1031         /*
1032          * we've locked either the loaded syscall or nosys
1033          */
1034         if (callp->sy_flags & SE_ARGC) {
1035                 int64_t (*sy_call)();
1036 
1037                 sy_call = (int64_t (*)())callp->sy_call;
1038                 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1039         } else {
1040                 rval = syscall_ap();
1041         }
1042 
1043         rw_exit(module_lock);
1044         return (rval);
1045 }
1046 
1047 /*
1048  * Handle indirect system calls.
1049  *      This interface should be deprecated.  The library can handle
1050  *      this more efficiently, but keep this implementation for old binaries.
1051  *
1052  * XX64 Needs some work.
1053  */
1054 int64_t
1055 indir(int code, long a0, long a1, long a2, long a3, long a4)
1056 {
1057         klwp_t          *lwp = ttolwp(curthread);
1058         struct sysent   *callp;
1059 
1060         if (code <= 0 || code >= NSYSCALL)
1061                 return (nosys());
1062 
1063         ASSERT(lwp->lwp_ap != NULL);
1064 
1065         curthread->t_sysnum = code;
1066         callp = LWP_GETSYSENT(lwp) + code;
1067 
1068         /*
1069          * Handle argument setup, unless already done in pre_syscall().
1070          */
1071         if (callp->sy_narg > 5) {
1072                 if (save_syscall_args())        /* move args to LWP array */
1073                         return ((int64_t)set_errno(EFAULT));
1074         } else if (!lwp->lwp_argsaved) {
1075                 long *ap;
1076 
1077                 ap = lwp->lwp_ap;            /* args haven't been saved */
1078                 lwp->lwp_ap = ap + 1;                /* advance arg pointer */
1079                 curthread->t_post_sys = 1;   /* so lwp_ap will be reset */
1080         }
1081         return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1082 }
1083 
1084 /*
1085  * set_errno - set an error return from the current system call.
1086  *      This could be a macro.
1087  *      This returns the value it is passed, so that the caller can
1088  *      use tail-recursion-elimination and do return (set_errno(ERRNO));
1089  */
1090 uint_t
1091 set_errno(uint_t error)
1092 {
1093         ASSERT(error != 0);             /* must not be used to clear errno */
1094 
1095         curthread->t_post_sys = 1;   /* have post_syscall do error return */
1096         return (ttolwp(curthread)->lwp_errno = error);
1097 }
1098 
1099 /*
1100  * set_proc_pre_sys - Set pre-syscall processing for entire process.
1101  */
1102 void
1103 set_proc_pre_sys(proc_t *p)
1104 {
1105         kthread_t       *t;
1106         kthread_t       *first;
1107 
1108         ASSERT(MUTEX_HELD(&p->p_lock));
1109 
1110         t = first = p->p_tlist;
1111         do {
1112                 t->t_pre_sys = 1;
1113         } while ((t = t->t_forw) != first);
1114 }
1115 
1116 /*
1117  * set_proc_post_sys - Set post-syscall processing for entire process.
1118  */
1119 void
1120 set_proc_post_sys(proc_t *p)
1121 {
1122         kthread_t       *t;
1123         kthread_t       *first;
1124 
1125         ASSERT(MUTEX_HELD(&p->p_lock));
1126 
1127         t = first = p->p_tlist;
1128         do {
1129                 t->t_post_sys = 1;
1130         } while ((t = t->t_forw) != first);
1131 }
1132 
1133 /*
1134  * set_proc_sys - Set pre- and post-syscall processing for entire process.
1135  */
1136 void
1137 set_proc_sys(proc_t *p)
1138 {
1139         kthread_t       *t;
1140         kthread_t       *first;
1141 
1142         ASSERT(MUTEX_HELD(&p->p_lock));
1143 
1144         t = first = p->p_tlist;
1145         do {
1146                 t->t_pre_sys = 1;
1147                 t->t_post_sys = 1;
1148         } while ((t = t->t_forw) != first);
1149 }
1150 
1151 /*
1152  * set_all_proc_sys - set pre- and post-syscall processing flags for all
1153  * user processes.
1154  *
1155  * This is needed when auditing, tracing, or other facilities which affect
1156  * all processes are turned on.
1157  */
1158 void
1159 set_all_proc_sys()
1160 {
1161         kthread_t       *t;
1162         kthread_t       *first;
1163 
1164         mutex_enter(&pidlock);
1165         t = first = curthread;
1166         do {
1167                 t->t_pre_sys = 1;
1168                 t->t_post_sys = 1;
1169         } while ((t = t->t_next) != first);
1170         mutex_exit(&pidlock);
1171 }
1172 
1173 /*
1174  * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1175  * all user processes running in the zone of the current process
1176  *
1177  * This is needed when auditing is turned on.
1178  */
1179 void
1180 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1181 {
1182         proc_t      *p;
1183         kthread_t   *t;
1184 
1185         mutex_enter(&pidlock);
1186         for (p = practive; p != NULL; p = p->p_next) {
1187                 /* skip kernel processes */
1188                 if (p->p_exec == NULLVP || p->p_as == &kas ||
1189                     p->p_stat == SIDL || p->p_stat == SZOMB ||
1190                     (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1191                         continue;
1192                 /*
1193                  * Only processes in the given zone (eventually in
1194                  * all zones) are taken into account
1195                  */
1196                 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1197                         mutex_enter(&p->p_lock);
1198                         if ((t = p->p_tlist) == NULL) {
1199                                 mutex_exit(&p->p_lock);
1200                                 continue;
1201                         }
1202                         /*
1203                          * Set pre- and post-syscall processing flags
1204                          * for all threads of the process
1205                          */
1206                         do {
1207                                 t->t_pre_sys = 1;
1208                                 t->t_post_sys = 1;
1209                         } while (p->p_tlist != (t = t->t_forw));
1210                         mutex_exit(&p->p_lock);
1211                 }
1212         }
1213         mutex_exit(&pidlock);
1214 }
1215 
1216 /*
1217  * set_proc_ast - Set asynchronous service trap (AST) flag for all
1218  * threads in process.
1219  */
1220 void
1221 set_proc_ast(proc_t *p)
1222 {
1223         kthread_t       *t;
1224         kthread_t       *first;
1225 
1226         ASSERT(MUTEX_HELD(&p->p_lock));
1227 
1228         t = first = p->p_tlist;
1229         do {
1230                 aston(t);
1231         } while ((t = t->t_forw) != first);
1232 }