de-linting of .s files
remove inlines, some other files

          --- old/usr/src/uts/i86pc/ml/syscall_asm.s
          +++ new/usr/src/uts/i86pc/ml/syscall_asm.s
          (12 unchanged lines elided)
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2019 Joyent, Inc.
  23   24   * Copyright (c) 2016 by Delphix. All rights reserved.
  24   25   */
  25   26  
  26      -/*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27      -/*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28      -/*        All Rights Reserved                                   */
  29      -
  30      -/*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31      -/*        All Rights Reserved                                   */
  32      -
  33   27  #include <sys/asm_linkage.h>
  34   28  #include <sys/asm_misc.h>
  35   29  #include <sys/regset.h>
       30 +#include <sys/privregs.h>
  36   31  #include <sys/psw.h>
  37      -#include <sys/x86_archext.h>
  38   32  #include <sys/machbrand.h>
  39      -#include <sys/privregs.h>
  40   33  
  41      -#if defined(__lint)
  42      -
  43      -#include <sys/types.h>
  44      -#include <sys/thread.h>
  45      -#include <sys/systm.h>
  46      -
  47      -#else   /* __lint */
  48      -
  49   34  #include <sys/segments.h>
  50   35  #include <sys/pcb.h>
  51   36  #include <sys/trap.h>
  52   37  #include <sys/ftrace.h>
  53   38  #include <sys/traptrace.h>
  54   39  #include <sys/clock.h>
       40 +#include <sys/model.h>
  55   41  #include <sys/panic.h>
  56      -#include "assym.h"
  57   42  
  58      -#endif  /* __lint */
       43 +#if defined(__xpv)
       44 +#include <sys/hypervisor.h>
       45 +#endif
  59   46  
       47 +#include "assym.h"
       48 +
  60   49  /*
  61      - * We implement two flavours of system call entry points
       50 + * We implement five flavours of system call entry points
  62   51   *
  63      - * -    {int,lcall}/iret        (i386)
  64      - * -    sysenter/sysexit        (Pentium II and beyond)
       52 + * -    syscall/sysretq         (amd64 generic)
       53 + * -    syscall/sysretl         (i386 plus SYSC bit)
       54 + * -    sysenter/sysexit        (i386 plus SEP bit)
       55 + * -    int/iret                (i386 generic)
       56 + * -    lcall/iret              (i386 generic)
  65   57   *
  66      - * The basic pattern used in the handlers is to check to see if we can
  67      - * do fast (simple) version of the system call; if we can't we use various
  68      - * C routines that handle corner cases and debugging.
       58 + * The current libc included in Solaris uses int/iret as the base unoptimized
       59 + * kernel entry method. Older libc implementations and legacy binaries may use
       60 + * the lcall call gate, so it must continue to be supported.
  69   61   *
  70      - * To reduce the amount of assembler replication, yet keep the system call
  71      - * implementations vaguely comprehensible, the common code in the body
  72      - * of the handlers is broken up into a set of preprocessor definitions
  73      - * below.
       62 + * System calls that use an lcall call gate are processed in trap() via a
       63 + * segment-not-present trap, i.e. lcalls are extremely slow(!).
       64 + *
       65 + * The basic pattern used in the 32-bit SYSC handler at this point in time is
       66 + * to have the bare minimum of assembler, and get to the C handlers as
       67 + * quickly as possible.
       68 + *
       69 + * The 64-bit handler is much closer to the sparcv9 handler; that's
       70 + * because of passing arguments in registers.  The 32-bit world still
       71 + * passes arguments on the stack -- that makes that handler substantially
       72 + * more complex.
       73 + *
       74 + * The two handlers share a few code fragments which are broken
       75 + * out into preprocessor macros below.
       76 + *
       77 + * XX64 come back and speed all this up later.  The 32-bit stuff looks
       78 + * especially easy to speed up the argument copying part ..
       79 + *
       80 + *
       81 + * Notes about segment register usage (c.f. the 32-bit kernel)
       82 + *
       83 + * In the 32-bit kernel, segment registers are dutifully saved and
       84 + * restored on all mode transitions because the kernel uses them directly.
       85 + * When the processor is running in 64-bit mode, segment registers are
       86 + * largely ignored.
       87 + *
       88 + * %cs and %ss
       89 + *      controlled by the hardware mechanisms that make mode transitions
       90 + *
       91 + * The remaining segment registers have to either be pointing at a valid
        92 + * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
       93 + *
       94 + * %ds and %es
       95 + *      always ignored
       96 + *
       97 + * %fs and %gs
       98 + *      fsbase and gsbase are used to control the place they really point at.
       99 + *      The kernel only depends on %gs, and controls its own gsbase via swapgs
      100 + *
      101 + * Note that loading segment registers is still costly because the GDT
      102 + * lookup still happens (this is because the hardware can't know that we're
      103 + * not setting up these segment registers for a 32-bit program).  Thus we
      104 + * avoid doing this in the syscall path, and defer them to lwp context switch
      105 + * handlers, so the register values remain virtualized to the lwp.
  74  106   */
  75  107  
  76      -/*
  77      - * When we have SYSCALLTRACE defined, we sneak an extra
  78      - * predicate into a couple of tests.
  79      - */
  80  108  #if defined(SYSCALLTRACE)
  81      -#define ORL_SYSCALLTRACE(r32)   \
  82      -        orl     syscalltrace, r32
      109 +#define ORL_SYSCALLTRACE(r32)           \
      110 +        orl     syscalltrace(%rip), r32
  83  111  #else
  84  112  #define ORL_SYSCALLTRACE(r32)
  85  113  #endif
  86  114  
  87  115  /*
  88      - * This check is false whenever we want to go fast i.e.
      116 + * In the 32-bit kernel, we do absolutely nothing before getting into the
      117 + * brand callback checks.  In 64-bit land, we do swapgs and then come here.
      118 + * We assume that the %rsp- and %r15-stashing fields in the CPU structure
      119 + * are still unused.
  89  120   *
  90      - *      if (code >= NSYSCALL ||
  91      - *          t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
  92      - *              do full version
  93      - * #ifdef SYSCALLTRACE
  94      - *      if (syscalltrace)
  95      - *              do full version
  96      - * #endif
  97      - *
  98      - * Preconditions:
  99      - * -    t       curthread
 100      - * -    code    contains the syscall number
 101      - * Postconditions:
 102      - * -    %ecx and %edi are smashed
 103      - * -    condition code flag ZF is cleared if pre-sys is too complex
 104      - */
 105      -#define CHECK_PRESYS_NE(t, code)                \
 106      -        movzbl  T_PRE_SYS(t), %edi;             \
 107      -        movzwl  T_PROC_FLAG(t), %ecx;           \
 108      -        andl    $TP_WATCHPT, %ecx;              \
 109      -        orl     %ecx, %edi;                     \
 110      -        cmpl    $NSYSCALL, code;                \
 111      -        setae   %cl;                            \
 112      -        movzbl  %cl, %ecx;                      \
 113      -        orl     %ecx, %edi;                     \
 114      -        ORL_SYSCALLTRACE(%edi)
 115      -
 116      -/*
 117  121   * Check if a brand_mach_ops callback is defined for the specified callback_id
 118      - * type.  If so invoke it with the user's %gs value loaded and the following
      122 + * type.  If so invoke it with the kernel's %gs value loaded and the following
 119  123   * data on the stack:
 120      - *         --------------------------------------
 121      - *         | user's %ss                         |
 122      - *    |    | user's %esp                        |
 123      - *    |    | EFLAGS register                    |
 124      - *    |    | user's %cs                         |
 125      - *    |    | user's %eip (user return address)  |
 126      - *    |    | 'scratch space'                    |
 127      - *    |    | user's %ebx                        |
 128      - *    |    | user's %gs selector                |
 129      - *    v    | lwp pointer                        |
 130      - *         | callback wrapper return addr       |
      124 + *
      125 + * stack:  --------------------------------------
      126 + *      32 | callback pointer                   |
      127 + *    | 24 | user (or interrupt) stack pointer  |
      128 + *    | 16 | lwp pointer                        |
      129 + *    v  8 | userland return address            |
      130 + *       0 | callback wrapper return addr       |
 131  131   *         --------------------------------------
 132  132   *
 133      - * If the brand code returns, we assume that we are meant to execute the
 134      - * normal system call path.
      133 + * Since we're pushing the userland return address onto the kernel stack
      134 + * we need to get that address without accessing the user's stack (since we
      135 + * can't trust that data).  There are different ways to get the userland
      136 + * return address depending on how the syscall trap was made:
 135  137   *
 136      - * The interface to the brand callbacks on the 32-bit kernel assumes %ebx
      138 + * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
      139 + * b) For sys_sysenter the return address is in %rdx.
      140 + * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
      141 + *    the stack pointer points at the state saved when we took the interrupt:
      142 + *       ------------------------
      143 + *    |  | user's %ss           |
      144 + *    |  | user's %esp          |
      145 + *    |  | EFLAGS register      |
      146 + *    v  | user's %cs           |
      147 + *       | user's %eip          |
      148 + *       ------------------------
      149 + *
      150 + * The 2nd parameter to the BRAND_CALLBACK macro is either the
      151 + * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
      152 + * used to generate the proper code to get the userland return address for
      153 + * each syscall entry point.
      154 + *
      155 + * The interface to the brand callbacks on the 64-bit kernel assumes %r15
 137  156   * is available as a scratch register within the callback.  If the callback
 138      - * returns within the kernel then this macro will restore %ebx.  If the
      157 + * returns within the kernel then this macro will restore %r15.  If the
 139  158   * callback is going to return directly to userland then it should restore
 140      - * %ebx before returning to userland.
      159 + * %r15 before returning to userland.
 141  160   */
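
 The frame described above can also be pictured as a C struct; this is only a
 sketch with invented names (the pushes in BRAND_CALLBACK below are the
 authoritative layout):

        #include <stdint.h>

        /* Hypothetical C view of the stack handed to a brand callback. */
        typedef struct brand_cb_frame {
                uintptr_t bcf_wrapper_ret;      /*  0: callback wrapper return addr  */
                uintptr_t bcf_user_ret;         /*  8: userland return address       */
                void      *bcf_lwp;             /* 16: lwp pointer                   */
                uintptr_t bcf_user_sp;          /* 24: user (or interrupt) stack ptr */
                uintptr_t bcf_callback;         /* 32: callback pointer              */
        } brand_cb_frame_t;
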
 142      -#define BRAND_CALLBACK(callback_id)                                         \
 143      -        subl    $4, %esp                /* save some scratch space      */ ;\
 144      -        pushl   %ebx                    /* save %ebx to use for scratch */ ;\
 145      -        pushl   %gs                     /* save the user %gs            */ ;\
 146      -        movl    $KGS_SEL, %ebx                                             ;\
 147      -        movw    %bx, %gs                /* switch to the kernel's %gs   */ ;\
 148      -        movl    %gs:CPU_THREAD, %ebx    /* load the thread pointer      */ ;\
 149      -        movl    T_LWP(%ebx), %ebx       /* load the lwp pointer         */ ;\
 150      -        pushl   %ebx                    /* push the lwp pointer         */ ;\
 151      -        movl    LWP_PROCP(%ebx), %ebx   /* load the proc pointer        */ ;\
 152      -        movl    P_BRAND(%ebx), %ebx     /* load the brand pointer       */ ;\
 153      -        movl    B_MACHOPS(%ebx), %ebx   /* load the machops pointer     */ ;\
 154      -        movl    _CONST(_MUL(callback_id, CPTRSIZE))(%ebx), %ebx            ;\
 155      -        cmpl    $0, %ebx                                                   ;\
      161 +#define BRAND_URET_FROM_REG(rip_reg)                                    \
      162 +        pushq   rip_reg                 /* push the return address      */
      163 +
      164 +/*
      165 + * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
      166 + * is currently pointing at the user return address (%eip).
      167 + */
      168 +#define BRAND_URET_FROM_INTR_STACK()                                    \
      169 +        movq    %gs:CPU_RTMP_RSP, %r15  /* grab the intr. stack pointer */ ;\
      170 +        pushq   (%r15)                  /* push the return address      */
      171 +
      172 +#define BRAND_CALLBACK(callback_id, push_userland_ret)                      \
      173 +        movq    %rsp, %gs:CPU_RTMP_RSP  /* save the stack pointer       */ ;\
      174 +        movq    %r15, %gs:CPU_RTMP_R15  /* save %r15                    */ ;\
      175 +        movq    %gs:CPU_THREAD, %r15    /* load the thread pointer      */ ;\
      176 +        movq    T_STACK(%r15), %rsp     /* switch to the kernel stack   */ ;\
      177 +        subq    $16, %rsp               /* save space for 2 pointers    */ ;\
      178 +        pushq   %r14                    /* save %r14                    */ ;\
      179 +        movq    %gs:CPU_RTMP_RSP, %r14                                     ;\
      180 +        movq    %r14, 8(%rsp)           /* stash the user stack pointer */ ;\
      181 +        popq    %r14                    /* restore %r14                 */ ;\
      182 +        movq    T_LWP(%r15), %r15       /* load the lwp pointer         */ ;\
      183 +        pushq   %r15                    /* push the lwp pointer         */ ;\
      184 +        movq    LWP_PROCP(%r15), %r15   /* load the proc pointer        */ ;\
      185 +        movq    P_BRAND(%r15), %r15     /* load the brand pointer       */ ;\
      186 +        movq    B_MACHOPS(%r15), %r15   /* load the machops pointer     */ ;\
      187 +        movq    _CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15            ;\
      188 +        cmpq    $0, %r15                                                   ;\
 156  189          je      1f                                                         ;\
 157      -        movl    %ebx, 12(%esp)          /* save callback to scratch     */ ;\
 158      -        movl    4(%esp), %ebx           /* grab the user %gs            */ ;\
 159      -        movw    %bx, %gs                /* restore the user %gs         */ ;\
 160      -        call    *12(%esp)               /* call callback in scratch     */ ;\
 161      -1:      movl    4(%esp), %ebx           /* restore user %gs (re-do if   */ ;\
 162      -        movw    %bx, %gs                /* branch due to no callback)   */ ;\
 163      -        movl    8(%esp), %ebx           /* restore user's %ebx          */ ;\
 164      -        addl    $16, %esp               /* restore stack ptr            */
      190 +        movq    %r15, 16(%rsp)          /* save the callback pointer    */ ;\
      191 +        push_userland_ret               /* push the return address      */ ;\
      192 +        movq    24(%rsp), %r15          /* load callback pointer        */ ;\
      193 +        INDIRECT_CALL_REG(r15)          /* call callback                */ ;\
      194 +1:      movq    %gs:CPU_RTMP_R15, %r15  /* restore %r15                 */ ;\
      195 +        movq    %gs:CPU_RTMP_RSP, %rsp  /* restore the stack pointer    */
 165  196  
 166  197  #define MSTATE_TRANSITION(from, to)             \
 167      -        pushl   $to;                            \
 168      -        pushl   $from;                          \
 169      -        call    syscall_mstate;                 \
 170      -        addl    $0x8, %esp
      198 +        movl    $from, %edi;                    \
      199 +        movl    $to, %esi;                      \
      200 +        call    syscall_mstate
 171  201  
 172  202  /*
 173      - * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 174      - * This must be called with interrupts or preemption disabled.
      203 + * Check to see if a simple (direct) return is possible i.e.
      204 + *
      205 + *      if (t->t_post_sys_ast | syscalltrace |
      206 + *          lwp->lwp_pcb.pcb_rupdate == 1)
      207 + *              do full version ;
      208 + *
      209 + * Preconditions:
      210 + * -    t is curthread
      211 + * Postconditions:
      212 + * -    condition code NE is set if post-sys is too complex
      213 + * -    rtmp is zeroed if it isn't (we rely on this!)
      214 + * -    ltmp is smashed
 175  215   */
 176      -#define CPU_STATS_SYS_SYSCALL_INC                       \
 177      -        addl    $1, %gs:CPU_STATS_SYS_SYSCALL;          \
 178      -        adcl    $0, %gs:CPU_STATS_SYS_SYSCALL+4;
      216 +#define CHECK_POSTSYS_NE(t, ltmp, rtmp)                 \
      217 +        movq    T_LWP(t), ltmp;                         \
      218 +        movzbl  PCB_RUPDATE(ltmp), rtmp;                \
      219 +        ORL_SYSCALLTRACE(rtmp);                         \
      220 +        orl     T_POST_SYS_AST(t), rtmp;                \
      221 +        cmpl    $0, rtmp
 179  222  
 180      -#if !defined(__lint)
      223 +/*
      224 + * Fix up the lwp, thread, and eflags for a successful return
      225 + *
      226 + * Preconditions:
      227 + * -    zwreg contains zero
      228 + */
      229 +#define SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)           \
      230 +        movb    $LWP_USER, LWP_STATE(lwp);              \
      231 +        movw    zwreg, T_SYSNUM(t);                     \
      232 +        andb    $_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
 181  233  
 182  234  /*
 183  235   * ASSERT(lwptoregs(lwp) == rp);
 184  236   *
 185      - * this may seem obvious, but very odd things happen if this
      237 + * This may seem obvious, but very odd things happen if this
 186  238   * assertion is false
 187  239   *
 188  240   * Preconditions:
      241 + *      (%rsp is ready for normal call sequence)
      242 + * Postconditions (if assertion is true):
      243 + *      %r11 is smashed
      244 + *
      245 + * ASSERT(rp->r_cs == descnum)
      246 + *
      247 + * The code selector is written into the regs structure when the
      248 + * lwp stack is created.  We use this ASSERT to validate that
      249 + * the regs structure really matches how we came in.
      250 + *
      251 + * Preconditions:
      252 + *      (%rsp is ready for normal call sequence)
      253 + * Postconditions (if assertion is true):
 189  254   *      -none-
      255 + *
      256 + * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
      257 + *
       258 + * If this is false, it means that we returned to userland without
      259 + * updating the segment registers as we were supposed to.
      260 + *
      261 + * Note that we must ensure no interrupts or other traps intervene
      262 + * between entering privileged mode and performing the assertion,
      263 + * otherwise we may perform a context switch on the thread, which
      264 + * will end up setting pcb_rupdate to 1 again.
      265 + *
      266 + * ASSERT(%cr0 & CR0_TS == 0);
      267 + * Preconditions:
      268 + *      (%rsp is ready for normal call sequence)
 190  269   * Postconditions (if assertion is true):
 191      - *      %esi and %edi are smashed
      270 + *      (specified register is clobbered)
      271 + *
      272 + * Check to make sure that we are returning to user land and that CR0.TS
      273 + * is not set. This is required as part of the eager FPU (see
      274 + * uts/intel/ia32/os/fpu.c for more information).
 192  275   */
      276 +
 193  277  #if defined(DEBUG)
 194  278  
 195  279  __lwptoregs_msg:
 196  280          .string "syscall_asm.s:%d lwptoregs(%p) [%p] != rp [%p]"
 197  281  
 198      -#define ASSERT_LWPTOREGS(t, rp)                         \
 199      -        movl    T_LWP(t), %esi;                         \
 200      -        movl    LWP_REGS(%esi), %edi;                   \
 201      -        cmpl    rp, %edi;                               \
      282 +__codesel_msg:
      283 +        .string "syscall_asm.s:%d rp->r_cs [%ld] != %ld"
      284 +
      285 +__no_rupdate_msg:
      286 +        .string "syscall_asm.s:%d lwp %p, pcb_rupdate != 0"
      287 +
      288 +__bad_ts_msg:
      289 +        .string "syscall_asm.s:%d CR0.TS set on user return"
      290 +
      291 +#define ASSERT_LWPTOREGS(lwp, rp)                       \
      292 +        movq    LWP_REGS(lwp), %r11;                    \
      293 +        cmpq    rp, %r11;                               \
 202  294          je      7f;                                     \
 203      -        pushl   rp;                                     \
 204      -        pushl   %edi;                                   \
 205      -        pushl   %esi;                                   \
 206      -        pushl   $__LINE__;                              \
 207      -        pushl   $__lwptoregs_msg;                       \
      295 +        leaq    __lwptoregs_msg(%rip), %rdi;            \
      296 +        movl    $__LINE__, %esi;                        \
      297 +        movq    lwp, %rdx;                              \
      298 +        movq    %r11, %rcx;                             \
      299 +        movq    rp, %r8;                                \
      300 +        xorl    %eax, %eax;                             \
 208  301          call    panic;                                  \
 209  302  7:
      303 +
      304 +#define ASSERT_NO_RUPDATE_PENDING(lwp)                  \
      305 +        testb   $0x1, PCB_RUPDATE(lwp);                 \
      306 +        je      8f;                                     \
      307 +        movq    lwp, %rdx;                              \
      308 +        leaq    __no_rupdate_msg(%rip), %rdi;           \
      309 +        movl    $__LINE__, %esi;                        \
      310 +        xorl    %eax, %eax;                             \
      311 +        call    panic;                                  \
      312 +8:
      313 +
      314 +#define ASSERT_CR0TS_ZERO(reg)                          \
      315 +        movq    %cr0, reg;                              \
      316 +        testq   $CR0_TS, reg;                           \
      317 +        jz      9f;                                     \
      318 +        leaq    __bad_ts_msg(%rip), %rdi;               \
      319 +        movl    $__LINE__, %esi;                        \
      320 +        xorl    %eax, %eax;                             \
      321 +        call    panic;                                  \
      322 +9:
      323 +
 210  324  #else
 211      -#define ASSERT_LWPTOREGS(t, rp)
      325 +#define ASSERT_LWPTOREGS(lwp, rp)
      326 +#define ASSERT_NO_RUPDATE_PENDING(lwp)
      327 +#define ASSERT_CR0TS_ZERO(reg)
 212  328  #endif
 213  329  
 214      -#endif  /* __lint */
 215      -
 216  330  /*
 217      - * This is an assembler version of this fragment:
 218      - *
 219      - * lwp->lwp_state = LWP_SYS;
 220      - * lwp->lwp_ru.sysc++;
 221      - * lwp->lwp_eosys = NORMALRETURN;
 222      - * lwp->lwp_ap = argp;
 223      - *
 224      - * Preconditions:
 225      - *      -none-
 226      - * Postconditions:
 227      - *      -none-
      331 + * Do the traptrace thing and restore any registers we used
      332 + * in situ.  Assumes that %rsp is pointing at the base of
      333 + * the struct regs, obviously ..
 228  334   */
 229      -#define SET_LWP(lwp, argp)                              \
 230      -        movb    $LWP_SYS, LWP_STATE(lwp);               \
 231      -        addl    $1, LWP_RU_SYSC(lwp);                   \
 232      -        adcl    $0, LWP_RU_SYSC+4(lwp);                 \
 233      -        movb    $NORMALRETURN, LWP_EOSYS(lwp);          \
 234      -        movl    argp, LWP_AP(lwp)
      335 +#ifdef TRAPTRACE
      336 +#define SYSCALL_TRAPTRACE(ttype)                                \
      337 +        TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);               \
      338 +        TRACE_REGS(%rdi, %rsp, %rbx, %rcx);                     \
      339 +        TRACE_STAMP(%rdi);      /* rdtsc clobbers %eax, %edx */ \
      340 +        movq    REGOFF_RAX(%rsp), %rax;                         \
      341 +        movq    REGOFF_RBX(%rsp), %rbx;                         \
      342 +        movq    REGOFF_RCX(%rsp), %rcx;                         \
      343 +        movq    REGOFF_RDX(%rsp), %rdx;                         \
      344 +        movl    %eax, TTR_SYSNUM(%rdi);                         \
      345 +        movq    REGOFF_RDI(%rsp), %rdi
 235  346  
 236      -/*
 237      - * Set up the thread, lwp, find the handler, and copy
 238      - * in the arguments from userland to the kernel stack.
 239      - *
 240      - * Preconditions:
 241      - * -    %eax contains the syscall number
 242      - * Postconditions:
 243      - * -    %eax contains a pointer to the sysent structure
 244      - * -    %ecx is zeroed
 245      - * -    %esi, %edi are smashed
 246      - * -    %esp is SYS_DROPped ready for the syscall
 247      - */
 248      -#define SIMPLE_SYSCALL_PRESYS(t, faultlabel)            \
 249      -        movl    T_LWP(t), %esi;                         \
 250      -        movw    %ax, T_SYSNUM(t);                       \
 251      -        subl    $SYS_DROP, %esp;                        \
 252      -        shll    $SYSENT_SIZE_SHIFT, %eax;                       \
 253      -        SET_LWP(%esi, %esp);                            \
 254      -        leal    sysent(%eax), %eax;                     \
 255      -        movzbl  SY_NARG(%eax), %ecx;                    \
 256      -        testl   %ecx, %ecx;                             \
 257      -        jz      4f;                                     \
 258      -        movl    %esp, %edi;                             \
 259      -        movl    SYS_DROP + REGOFF_UESP(%esp), %esi;     \
 260      -        movl    $faultlabel, T_LOFAULT(t);              \
 261      -        addl    $4, %esi;                               \
 262      -        rep;                                            \
 263      -          smovl;                                        \
 264      -        movl    %ecx, T_LOFAULT(t);                     \
 265      -4:
      347 +#define SYSCALL_TRAPTRACE32(ttype)                              \
      348 +        SYSCALL_TRAPTRACE(ttype);                               \
      349 +        /* paranoia: clean the top 32-bits of the registers */  \
      350 +        orl     %eax, %eax;                                     \
      351 +        orl     %ebx, %ebx;                                     \
      352 +        orl     %ecx, %ecx;                                     \
      353 +        orl     %edx, %edx;                                     \
      354 +        orl     %edi, %edi
      355 +#else   /* TRAPTRACE */
      356 +#define SYSCALL_TRAPTRACE(ttype)
      357 +#define SYSCALL_TRAPTRACE32(ttype)
      358 +#endif  /* TRAPTRACE */
 266  359  
 267  360  /*
 268      - * Check to see if a simple return is possible i.e.
      361 + * The 64-bit libc syscall wrapper does this:
 269  362   *
 270      - *      if ((t->t_post_sys_ast | syscalltrace) != 0)
 271      - *              do full version;
      363 + * fn(<args>)
      364 + * {
      365 + *      movq    %rcx, %r10      -- because syscall smashes %rcx
      366 + *      movl    $CODE, %eax
      367 + *      syscall
      368 + *      <error processing>
      369 + * }
 272  370   *
 273      - * Preconditions:
 274      - * -    t is curthread
 275      - * Postconditions:
 276      - * -    condition code NE is set if post-sys is too complex
 277      - * -    rtmp is zeroed if it isn't (we rely on this!)
 278      - */
 279      -#define CHECK_POSTSYS_NE(t, rtmp)                       \
 280      -        xorl    rtmp, rtmp;                             \
 281      -        ORL_SYSCALLTRACE(rtmp);                         \
 282      -        orl     T_POST_SYS_AST(t), rtmp;                \
 283      -        cmpl    $0, rtmp
 284      -
 285      -/*
 286      - * Fix up the lwp, thread, and eflags for a successful return
      371 + * Thus when we come into the kernel:
 287  372   *
 288      - * Preconditions:
 289      - * -    zwreg contains zero
 290      - * Postconditions:
 291      - * -    %esp has been unSYS_DROPped
 292      - * -    %esi is smashed (points to lwp)
 293      - */
 294      -#define SIMPLE_SYSCALL_POSTSYS(t, zwreg)                \
 295      -        movl    T_LWP(t), %esi;                         \
 296      -        addl    $SYS_DROP, %esp;                        \
 297      -        movw    zwreg, T_SYSNUM(t);                     \
 298      -        movb    $LWP_USER, LWP_STATE(%esi);             \
 299      -        andb    $_CONST(0xffff - PS_C), REGOFF_EFL(%esp)
 300      -
 301      -/*
 302      - * System call handler.  This is the destination of both the call
 303      - * gate (lcall 0x27) _and_ the interrupt gate (int 0x91). For our purposes,
 304      - * there are two significant differences between an interrupt gate and a call
 305      - * gate:
      373 + *      %rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
      374 + *      %rax is the syscall number
      375 + *      %r12-%r15 contain caller state
 306  376   *
 307      - * 1) An interrupt gate runs the handler with interrupts disabled, whereas a
 308      - * call gate runs the handler with whatever EFLAGS settings were in effect at
 309      - * the time of the call.
      377 + * The syscall instruction arranges that:
 310  378   *
 311      - * 2) An interrupt gate pushes the contents of the EFLAGS register at the time
 312      - * of the interrupt onto the stack, whereas a call gate does not.
      379 + *      %rcx contains the return %rip
      380 + *      %r11d contains bottom 32-bits of %rflags
      381 + *      %rflags is masked (as determined by the SFMASK msr)
      382 + *      %cs is set to UCS_SEL (as determined by the STAR msr)
      383 + *      %ss is set to UDS_SEL (as determined by the STAR msr)
      384 + *      %rip is set to sys_syscall (as determined by the LSTAR msr)
 313  385   *
 314      - * Because we use the following code sequence to handle system calls made from
 315      - * _both_ a call gate _and_ an interrupt gate, these two differences must be
 316      - * respected. In regards to number 1) above, the handler must ensure that a sane
 317      - * EFLAGS snapshot is stored on the stack so that when the kernel returns back
 318      - * to the user via iret (which returns to user with the EFLAGS value saved on
 319      - * the stack), interrupts are re-enabled.
      386 + * Or in other words, we have no registers available at all.
      387 + * Only swapgs can save us!
 320  388   *
 321      - * In regards to number 2) above, the handler must always put a current snapshot
 322      - * of EFLAGS onto the stack in the appropriate place. If we came in via an
 323      - * interrupt gate, we will be clobbering the EFLAGS value that was pushed by
 324      - * the interrupt gate. This is OK, as the only bit that was changed by the
 325      - * hardware was the IE (interrupt enable) bit, which for an interrupt gate is
 326      - * now off. If we were to do nothing, the stack would contain an EFLAGS with
 327      - * IE off, resulting in us eventually returning back to the user with interrupts
 328      - * disabled. The solution is to turn on the IE bit in the EFLAGS value saved on
 329      - * the stack.
      389 + * Under the hypervisor, the swapgs has happened already.  However, the
       390 + * state of the world is very different from the one we're familiar with.
 330  391   *
 331      - * Another subtlety which deserves mention is the difference between the two
 332      - * descriptors. The call gate descriptor is set to instruct the hardware to copy
 333      - * one parameter from the user stack to the kernel stack, whereas the interrupt
 334      - * gate descriptor doesn't use the parameter passing mechanism at all. The
 335      - * kernel doesn't actually use the parameter that is copied by the hardware; the
 336      - * only reason it does this is so that there is a space on the stack large
 337      - * enough to hold an EFLAGS register value, which happens to be in the correct
 338      - * place for use by iret when we go back to userland. How convenient.
      392 + * In particular, we have a stack structure like that for interrupt
      393 + * gates, except that the %cs and %ss registers are modified for reasons
      394 + * that are not entirely clear.  Critically, the %rcx/%r11 values do
      395 + * *not* reflect the usage of those registers under a 'real' syscall[1];
      396 + * the stack, therefore, looks like this:
 339  397   *
 340      - * Stack frame description in syscall() and callees.
      398 + *      0x0(rsp)        potentially junk %rcx
      399 + *      0x8(rsp)        potentially junk %r11
      400 + *      0x10(rsp)       user %rip
      401 + *      0x18(rsp)       modified %cs
      402 + *      0x20(rsp)       user %rflags
      403 + *      0x28(rsp)       user %rsp
      404 + *      0x30(rsp)       modified %ss
 341  405   *
 342      - * |------------|
 343      - * | regs       | +(8*4)+4      registers
 344      - * |------------|
 345      - * | 8 args     | <- %esp       MAXSYSARGS (currently 8) arguments
 346      - * |------------|
 347  406   *
      407 + * and before continuing on, we must load the %rip into %rcx and the
      408 + * %rflags into %r11.
      409 + *
      410 + * [1] They used to, and we relied on it, but this was broken in 3.1.1.
      411 + * Sigh.
 348  412   */
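
 As a concrete illustration of the wrapper convention described in the comment
 above, here is a minimal userland stub (a sketch using GCC-style inline asm,
 not the actual libc source): the fourth argument is staged in %r10 because the
 syscall instruction smashes %rcx, and %rcx/%r11 are listed as clobbers for the
 reasons given above.

        static long
        my_syscall4(long code, long a0, long a1, long a2, long a3)
        {
                long rv;
                register long r10 __asm__("r10") = a3;  /* %rcx is smashed by syscall */

                __asm__ __volatile__(
                    "syscall"
                    : "=a" (rv)
                    : "a" (code), "D" (a0), "S" (a1), "d" (a2), "r" (r10)
                    : "rcx", "r11", "memory", "cc");
                /* real wrappers also test the carry flag here for <error processing> */
                return (rv);
        }
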
 349      -#define SYS_DROP        _CONST(_MUL(MAXSYSARGS, 4))
      413 +#if defined(__xpv)
      414 +#define XPV_SYSCALL_PROD                                                \
      415 +        movq    0x10(%rsp), %rcx;                                       \
      416 +        movq    0x20(%rsp), %r11;                                       \
      417 +        movq    0x28(%rsp), %rsp
      418 +#else
      419 +#define XPV_SYSCALL_PROD /* nothing */
      420 +#endif
 350  421  
 351      -#if defined(__lint)
      422 +        ENTRY_NP2(brand_sys_syscall,_allsyscalls)
      423 +        SWAPGS                          /* kernel gsbase */
      424 +        XPV_SYSCALL_PROD
      425 +        BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
      426 +        jmp     noprod_sys_syscall
 352  427  
 353      -/*ARGSUSED*/
 354      -void
 355      -sys_call()
 356      -{}
      428 +        ALTENTRY(sys_syscall)
      429 +        SWAPGS                          /* kernel gsbase */
      430 +        XPV_SYSCALL_PROD
 357  431  
 358      -void
 359      -_allsyscalls()
 360      -{}
      432 +noprod_sys_syscall:
      433 +        movq    %r15, %gs:CPU_RTMP_R15
      434 +        movq    %rsp, %gs:CPU_RTMP_RSP
 361  435  
 362      -size_t _allsyscalls_size;
      436 +        movq    %gs:CPU_THREAD, %r15
      437 +        movq    T_STACK(%r15), %rsp     /* switch from user to kernel stack */
 363  438  
 364      -#else   /* __lint */
      439 +        ASSERT_UPCALL_MASK_IS_SET
 365  440  
 366      -        ENTRY_NP2(brand_sys_call, _allsyscalls)
 367      -        BRAND_CALLBACK(BRAND_CB_SYSCALL)
      441 +        movl    $UCS_SEL, REGOFF_CS(%rsp)
      442 +        movq    %rcx, REGOFF_RIP(%rsp)          /* syscall: %rip -> %rcx */
      443 +        movq    %r11, REGOFF_RFL(%rsp)          /* syscall: %rfl -> %r11d */
      444 +        movl    $UDS_SEL, REGOFF_SS(%rsp)
 368  445  
 369      -        ALTENTRY(sys_call)
 370      -        / on entry      eax = system call number
      446 +        movl    %eax, %eax                      /* wrapper: sysc# -> %eax */
      447 +        movq    %rdi, REGOFF_RDI(%rsp)
      448 +        movq    %rsi, REGOFF_RSI(%rsp)
      449 +        movq    %rdx, REGOFF_RDX(%rsp)
      450 +        movq    %r10, REGOFF_RCX(%rsp)          /* wrapper: %rcx -> %r10 */
      451 +        movq    %r10, %rcx                      /* arg[3] for direct calls */
 371  452  
 372      -        / set up the stack to look as in reg.h
 373      -        subl    $8, %esp        / pad the stack with ERRCODE and TRAPNO
      453 +        movq    %r8, REGOFF_R8(%rsp)
      454 +        movq    %r9, REGOFF_R9(%rsp)
      455 +        movq    %rax, REGOFF_RAX(%rsp)
      456 +        movq    %rbx, REGOFF_RBX(%rsp)
 374  457  
 375      -        SYSCALL_PUSH
      458 +        movq    %rbp, REGOFF_RBP(%rsp)
      459 +        movq    %r10, REGOFF_R10(%rsp)
      460 +        movq    %gs:CPU_RTMP_RSP, %r11
      461 +        movq    %r11, REGOFF_RSP(%rsp)
      462 +        movq    %r12, REGOFF_R12(%rsp)
 376  463  
 377      -#ifdef TRAPTRACE
 378      -        TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL) / Uses labels "8" and "9"
 379      -        TRACE_REGS(%edi, %esp, %ebx, %ecx)      / Uses label "9"
 380      -        pushl   %eax
 381      -        TRACE_STAMP(%edi)               / Clobbers %eax, %edx, uses "9"
 382      -        popl    %eax
 383      -        movl    %eax, TTR_SYSNUM(%edi)
      464 +        movq    %r13, REGOFF_R13(%rsp)
      465 +        movq    %r14, REGOFF_R14(%rsp)
      466 +        movq    %gs:CPU_RTMP_R15, %r10
      467 +        movq    %r10, REGOFF_R15(%rsp)
      468 +        movq    $0, REGOFF_SAVFP(%rsp)
      469 +        movq    $0, REGOFF_SAVPC(%rsp)
      470 +
      471 +        /*
      472 +         * Copy these registers here in case we end up stopped with
      473 +         * someone (like, say, /proc) messing with our register state.
      474 +         * We don't -restore- them unless we have to in update_sregs.
      475 +         *
      476 +         * Since userland -can't- change fsbase or gsbase directly,
      477 +         * and capturing them involves two serializing instructions,
      478 +         * we don't bother to capture them here.
      479 +         */
      480 +        xorl    %ebx, %ebx
      481 +        movw    %ds, %bx
      482 +        movq    %rbx, REGOFF_DS(%rsp)
      483 +        movw    %es, %bx
      484 +        movq    %rbx, REGOFF_ES(%rsp)
      485 +        movw    %fs, %bx
      486 +        movq    %rbx, REGOFF_FS(%rsp)
      487 +        movw    %gs, %bx
      488 +        movq    %rbx, REGOFF_GS(%rsp)
      489 +
      490 +        /*
      491 +         * If we're trying to use TRAPTRACE though, I take that back: we're
      492 +         * probably debugging some problem in the SWAPGS logic and want to know
      493 +         * what the incoming gsbase was.
      494 +         *
      495 +         * Since we already did SWAPGS, record the KGSBASE.
      496 +         */
      497 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
      498 +        movl    $MSR_AMD_KGSBASE, %ecx
      499 +        rdmsr
      500 +        movl    %eax, REGOFF_GSBASE(%rsp)
      501 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
 384  502  #endif
 385  503  
 386      -_watch_do_syscall:
 387      -        movl    %esp, %ebp
      504 +        /*
      505 +         * Machine state saved in the regs structure on the stack
      506 +         * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
      507 +         * %eax is the syscall number
      508 +         * %rsp is the thread's stack, %r15 is curthread
      509 +         * REG_RSP(%rsp) is the user's stack
      510 +         */
 388  511  
 389      -        / Interrupts may be enabled here, so we must make sure this thread
 390      -        / doesn't migrate off the CPU while it updates the CPU stats.
 391      -        /
 392      -        / XXX This is only true if we got here via call gate thru the LDT for
 393      -        / old style syscalls. Perhaps this preempt++-- will go away soon?
 394      -        movl    %gs:CPU_THREAD, %ebx
 395      -        addb    $1, T_PREEMPT(%ebx)
 396      -        CPU_STATS_SYS_SYSCALL_INC
 397      -        subb    $1, T_PREEMPT(%ebx)
      512 +        SYSCALL_TRAPTRACE($TT_SYSC64)
 398  513  
      514 +        movq    %rsp, %rbp
      515 +
      516 +        movq    T_LWP(%r15), %r14
      517 +        ASSERT_NO_RUPDATE_PENDING(%r14)
 399  518          ENABLE_INTR_FLAGS
 400  519  
 401      -        pushl   %eax                            / preserve across mstate call
 402  520          MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 403      -        popl    %eax
      521 +        movl    REGOFF_RAX(%rsp), %eax  /* (%rax damaged by mstate call) */
 404  522  
 405      -        movl    %gs:CPU_THREAD, %ebx
      523 +        ASSERT_LWPTOREGS(%r14, %rsp)
 406  524  
 407      -        ASSERT_LWPTOREGS(%ebx, %esp)
      525 +        movb    $LWP_SYS, LWP_STATE(%r14)
      526 +        incq    LWP_RU_SYSC(%r14)
      527 +        movb    $NORMALRETURN, LWP_EOSYS(%r14)
 408  528  
 409      -        CHECK_PRESYS_NE(%ebx, %eax)
 410      -        jne     _full_syscall_presys
 411      -        SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
      529 +        incq    %gs:CPU_STATS_SYS_SYSCALL
 412  530  
 413      -_syslcall_call:
 414      -        call    *SY_CALLC(%eax)
      531 +        movw    %ax, T_SYSNUM(%r15)
      532 +        movzbl  T_PRE_SYS(%r15), %ebx
      533 +        ORL_SYSCALLTRACE(%ebx)
      534 +        testl   %ebx, %ebx
      535 +        jne     _syscall_pre
 415  536  
 416      -_syslcall_done:
 417      -        CHECK_POSTSYS_NE(%ebx, %ecx)
 418      -        jne     _full_syscall_postsys
 419      -        SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
 420      -        movl    %eax, REGOFF_EAX(%esp)
 421      -        movl    %edx, REGOFF_EDX(%esp)
      537 +_syscall_invoke:
      538 +        movq    REGOFF_RDI(%rbp), %rdi
      539 +        movq    REGOFF_RSI(%rbp), %rsi
      540 +        movq    REGOFF_RDX(%rbp), %rdx
      541 +        movq    REGOFF_RCX(%rbp), %rcx
      542 +        movq    REGOFF_R8(%rbp), %r8
      543 +        movq    REGOFF_R9(%rbp), %r9
 422  544  
      545 +        cmpl    $NSYSCALL, %eax
      546 +        jae     _syscall_ill
      547 +        shll    $SYSENT_SIZE_SHIFT, %eax
      548 +        leaq    sysent(%rax), %rbx
      549 +
      550 +        movq    SY_CALLC(%rbx), %rax
      551 +        INDIRECT_CALL_REG(rax)
      552 +
      553 +        movq    %rax, %r12
      554 +        movq    %rdx, %r13
      555 +
      556 +        /*
      557 +         * If the handler returns two ints, then we need to split the
      558 +         * 64-bit return value into two 32-bit values.
      559 +         */
      560 +        testw   $SE_32RVAL2, SY_FLAGS(%rbx)
      561 +        je      5f
      562 +        movq    %r12, %r13
      563 +        shrq    $32, %r13       /* upper 32-bits into %edx */
      564 +        movl    %r12d, %r12d    /* lower 32-bits into %eax */
      565 +5:
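
 For clarity, a small C model of the split just performed (illustrative only;
 a handler flagged SE_32RVAL2 returns two 32-bit values packed into the single
 64-bit %rax):

        #include <stdint.h>

        /* %rax in; the two halves that end up in %eax and %edx out. */
        static void
        split_rval(uint64_t rax, uint32_t *eax, uint32_t *edx)
        {
                *eax = (uint32_t)rax;           /* movl  %r12d, %r12d */
                *edx = (uint32_t)(rax >> 32);   /* shrq  $32, %r13    */
        }
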
      566 +        /*
      567 +         * Optimistically assume that there's no post-syscall
      568 +         * work to do.  (This is to avoid having to call syscall_mstate()
      569 +         * with interrupts disabled)
      570 +         */
 423  571          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 424  572  
 425      -        /
 426      -        / get back via iret
 427      -        /
 428      -        CLI(%edx)
 429      -        jmp     sys_rtt_syscall
      573 +        /*
      574 +         * We must protect ourselves from being descheduled here;
      575 +         * If we were, and we ended up on another cpu, or another
      576 +         * lwp got in ahead of us, it could change the segment
      577 +         * registers without us noticing before we return to userland.
      578 +         */
      579 +        CLI(%r14)
      580 +        CHECK_POSTSYS_NE(%r15, %r14, %ebx)
      581 +        jne     _syscall_post
 430  582  
 431      -_full_syscall_presys:
 432      -        movl    T_LWP(%ebx), %esi
 433      -        subl    $SYS_DROP, %esp
 434      -        movb    $LWP_SYS, LWP_STATE(%esi)
 435      -        pushl   %esp
 436      -        pushl   %ebx
 437      -        call    syscall_entry
 438      -        addl    $8, %esp
 439      -        jmp     _syslcall_call
      583 +        /*
      584 +         * We need to protect ourselves against non-canonical return values
      585 +         * because Intel doesn't check for them on sysret (AMD does).  Canonical
      586 +         * addresses on current amd64 processors only use 48-bits for VAs; an
      587 +         * address is canonical if all upper bits (47-63) are identical. If we
      588 +         * find a non-canonical %rip, we opt to go through the full
      589 +         * _syscall_post path which takes us into an iretq which is not
      590 +         * susceptible to the same problems sysret is.
      591 +         *
      592 +         * We're checking for a canonical address by first doing an arithmetic
      593 +         * shift. This will fill in the remaining bits with the value of bit 63.
      594 +         * If the address were canonical, the register would now have either all
      595 +         * zeroes or all ones in it. Therefore we add one (inducing overflow)
      596 +         * and compare against 1. A canonical address will either be zero or one
      597 +         * at this point, hence the use of ja.
      598 +         *
      599 +         * At this point, r12 and r13 have the return value so we can't use
      600 +         * those registers.
      601 +         */
      602 +        movq    REGOFF_RIP(%rsp), %rcx
      603 +        sarq    $47, %rcx
      604 +        incq    %rcx
      605 +        cmpq    $1, %rcx
      606 +        ja      _syscall_post
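
 A userland C sketch of the canonicality test above (illustrative, not kernel
 code): the arithmetic shift leaves bits 47..63 in the low bits and sign-fills
 the rest, so a canonical address collapses to 0 or -1, and adding one yields
 0 or 1; anything greater is non-canonical.

        #include <stdint.h>

        /* Mirrors: sarq $47, %rcx; incq %rcx; cmpq $1, %rcx; ja _syscall_post */
        static int
        is_canonical(uint64_t va)
        {
                int64_t sext = (int64_t)va >> 47;       /* assumes two's complement, as on x86 */

                return ((uint64_t)(sext + 1) <= 1);     /* 0 or 1 => canonical */
        }

 For example, is_canonical(0x00007fffffffffffULL) and
 is_canonical(0xffff800000000000ULL) return 1, while
 is_canonical(0x0000800000000000ULL) returns 0.
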
 440  607  
 441      -_full_syscall_postsys:
 442      -        addl    $SYS_DROP, %esp
 443      -        pushl   %edx
 444      -        pushl   %eax
 445      -        pushl   %ebx
 446      -        call    syscall_exit
 447      -        addl    $12, %esp
      608 +
      609 +        SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
      610 +
      611 +        movq    %r12, REGOFF_RAX(%rsp)
      612 +        movq    %r13, REGOFF_RDX(%rsp)
      613 +
      614 +        /*
      615 +         * Clobber %r11 as we check CR0.TS.
      616 +         */
      617 +        ASSERT_CR0TS_ZERO(%r11)
      618 +
      619 +        /*
      620 +         * Unlike other cases, because we need to restore the user stack pointer
      621 +         * before exiting the kernel we must clear the microarch state before
      622 +         * getting here. This should be safe because it means that the only
      623 +         * values on the bus after this are based on the user's registers and
      624 +         * potentially the addresses where we stored them. Given the constraints
      625 +         * of sysret, that's how it has to be.
      626 +         */
      627 +        call    x86_md_clear
      628 +
      629 +        /*
      630 +         * To get back to userland, we need the return %rip in %rcx and
      631 +         * the return %rfl in %r11d.  The sysretq instruction also arranges
      632 +         * to fix up %cs and %ss; everything else is our responsibility.
      633 +         */
      634 +        movq    REGOFF_RDI(%rsp), %rdi
      635 +        movq    REGOFF_RSI(%rsp), %rsi
      636 +        movq    REGOFF_RDX(%rsp), %rdx
      637 +        /* %rcx used to restore %rip value */
      638 +
      639 +        movq    REGOFF_R8(%rsp), %r8
      640 +        movq    REGOFF_R9(%rsp), %r9
      641 +        movq    REGOFF_RAX(%rsp), %rax
      642 +        movq    REGOFF_RBX(%rsp), %rbx
      643 +
      644 +        movq    REGOFF_RBP(%rsp), %rbp
      645 +        movq    REGOFF_R10(%rsp), %r10
      646 +        /* %r11 used to restore %rfl value */
      647 +        movq    REGOFF_R12(%rsp), %r12
      648 +
      649 +        movq    REGOFF_R13(%rsp), %r13
      650 +        movq    REGOFF_R14(%rsp), %r14
      651 +        movq    REGOFF_R15(%rsp), %r15
      652 +
      653 +        movq    REGOFF_RIP(%rsp), %rcx
      654 +        movl    REGOFF_RFL(%rsp), %r11d
      655 +
      656 +#if defined(__xpv)
      657 +        addq    $REGOFF_RIP, %rsp
      658 +#else
      659 +        movq    REGOFF_RSP(%rsp), %rsp
      660 +#endif
      661 +
      662 +        /*
      663 +         * There can be no instructions between the ALTENTRY below and
      664 +         * SYSRET or we could end up breaking brand support. See label usage
      665 +         * in sn1_brand_syscall_callback for an example.
      666 +         */
      667 +        ASSERT_UPCALL_MASK_IS_SET
      668 +#if defined(__xpv)
      669 +        SYSRETQ
      670 +        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
      671 +
      672 +        /*
      673 +         * We can only get here after executing a brand syscall
      674 +         * interposition callback handler and simply need to
      675 +         * "sysretq" back to userland. On the hypervisor this
      676 +         * involves the iret hypercall which requires us to construct
      677 +         * just enough of the stack needed for the hypercall.
      678 +         * (rip, cs, rflags, rsp, ss).
      679 +         */
      680 +        movq    %rsp, %gs:CPU_RTMP_RSP          /* save user's rsp */
      681 +        movq    %gs:CPU_THREAD, %r11
      682 +        movq    T_STACK(%r11), %rsp
      683 +
      684 +        movq    %rcx, REGOFF_RIP(%rsp)
      685 +        movl    $UCS_SEL, REGOFF_CS(%rsp)
      686 +        movq    %gs:CPU_RTMP_RSP, %r11
      687 +        movq    %r11, REGOFF_RSP(%rsp)
      688 +        pushfq
      689 +        popq    %r11                            /* hypercall enables ints */
      690 +        movq    %r11, REGOFF_RFL(%rsp)
      691 +        movl    $UDS_SEL, REGOFF_SS(%rsp)
      692 +        addq    $REGOFF_RIP, %rsp
      693 +        /*
      694 +         * XXPV: see comment in SYSRETQ definition for future optimization
      695 +         *       we could take.
      696 +         */
      697 +        ASSERT_UPCALL_MASK_IS_SET
      698 +        SYSRETQ
      699 +#else
      700 +        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
      701 +        jmp     tr_sysretq
      702 +#endif
      703 +        /*NOTREACHED*/
      704 +        SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
      705 +
      706 +_syscall_pre:
      707 +        call    pre_syscall
      708 +        movl    %eax, %r12d
      709 +        testl   %eax, %eax
      710 +        jne     _syscall_post_call
      711 +        /*
      712 +         * Didn't abort, so reload the syscall args and invoke the handler.
      713 +         */
      714 +        movzwl  T_SYSNUM(%r15), %eax
      715 +        jmp     _syscall_invoke
      716 +
      717 +_syscall_ill:
      718 +        call    nosys
      719 +        movq    %rax, %r12
      720 +        movq    %rdx, %r13
      721 +        jmp     _syscall_post_call
      722 +
      723 +_syscall_post:
      724 +        STI
      725 +        /*
      726 +         * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
      727 +         * so that we can account for the extra work it takes us to finish.
      728 +         */
      729 +        MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
      730 +_syscall_post_call:
      731 +        movq    %r12, %rdi
      732 +        movq    %r13, %rsi
      733 +        call    post_syscall
 448  734          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 449  735          jmp     _sys_rtt
      736 +        SET_SIZE(sys_syscall)
      737 +        SET_SIZE(brand_sys_syscall)
 450  738  
 451      -_syscall_fault:
 452      -        push    $0xe                    / EFAULT
 453      -        call    set_errno
 454      -        addl    $4, %esp
 455      -        xorl    %eax, %eax              / fake syscall_err()
 456      -        xorl    %edx, %edx
 457      -        jmp     _syslcall_done
 458      -        SET_SIZE(sys_call)
 459      -        SET_SIZE(brand_sys_call)
      739 +        ENTRY_NP(brand_sys_syscall32)
      740 +        SWAPGS                          /* kernel gsbase */
      741 +        XPV_TRAP_POP
      742 +        BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
      743 +        jmp     nopop_sys_syscall32
 460  744  
 461      -#endif  /* __lint */
      745 +        ALTENTRY(sys_syscall32)
      746 +        SWAPGS                          /* kernel gsbase */
      747 +        XPV_TRAP_POP
 462  748  
      749 +nopop_sys_syscall32:
      750 +        movl    %esp, %r10d
      751 +        movq    %gs:CPU_THREAD, %r15
      752 +        movq    T_STACK(%r15), %rsp
      753 +        movl    %eax, %eax
      754 +
      755 +        movl    $U32CS_SEL, REGOFF_CS(%rsp)
      756 +        movl    %ecx, REGOFF_RIP(%rsp)          /* syscall: %rip -> %rcx */
      757 +        movq    %r11, REGOFF_RFL(%rsp)          /* syscall: %rfl -> %r11d */
      758 +        movq    %r10, REGOFF_RSP(%rsp)
      759 +        movl    $UDS_SEL, REGOFF_SS(%rsp)
      760 +
      761 +_syscall32_save:
      762 +        movl    %edi, REGOFF_RDI(%rsp)
      763 +        movl    %esi, REGOFF_RSI(%rsp)
      764 +        movl    %ebp, REGOFF_RBP(%rsp)
      765 +        movl    %ebx, REGOFF_RBX(%rsp)
      766 +        movl    %edx, REGOFF_RDX(%rsp)
      767 +        movl    %ecx, REGOFF_RCX(%rsp)
      768 +        movl    %eax, REGOFF_RAX(%rsp)          /* wrapper: sysc# -> %eax */
      769 +        movq    $0, REGOFF_SAVFP(%rsp)
      770 +        movq    $0, REGOFF_SAVPC(%rsp)
      771 +
      772 +        /*
      773 +         * Copy these registers here in case we end up stopped with
      774 +         * someone (like, say, /proc) messing with our register state.
      775 +         * We don't -restore- them unless we have to in update_sregs.
      776 +         *
      777 +         * Since userland -can't- change fsbase or gsbase directly,
      778 +         * we don't bother to capture them here.
      779 +         */
      780 +        xorl    %ebx, %ebx
      781 +        movw    %ds, %bx
      782 +        movq    %rbx, REGOFF_DS(%rsp)
      783 +        movw    %es, %bx
      784 +        movq    %rbx, REGOFF_ES(%rsp)
      785 +        movw    %fs, %bx
      786 +        movq    %rbx, REGOFF_FS(%rsp)
      787 +        movw    %gs, %bx
      788 +        movq    %rbx, REGOFF_GS(%rsp)
      789 +
      790 +        /*
      791 +         * If we're trying to use TRAPTRACE though, I take that back: we're
      792 +         * probably debugging some problem in the SWAPGS logic and want to know
      793 +         * what the incoming gsbase was.
      794 +         *
      795 +         * Since we already did SWAPGS, record the KGSBASE.
      796 +         */
      797 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
      798 +        movl    $MSR_AMD_KGSBASE, %ecx
      799 +        rdmsr
      800 +        movl    %eax, REGOFF_GSBASE(%rsp)
      801 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
      802 +#endif
      803 +
      804 +        /*
      805 +         * Application state saved in the regs structure on the stack
      806 +         * %eax is the syscall number
      807 +         * %rsp is the thread's stack, %r15 is curthread
      808 +         * REG_RSP(%rsp) is the user's stack
      809 +         */
      810 +
      811 +        SYSCALL_TRAPTRACE32($TT_SYSC)
      812 +
      813 +        movq    %rsp, %rbp
      814 +
      815 +        movq    T_LWP(%r15), %r14
      816 +        ASSERT_NO_RUPDATE_PENDING(%r14)
      817 +
      818 +        ENABLE_INTR_FLAGS
      819 +
      820 +        MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
      821 +        movl    REGOFF_RAX(%rsp), %eax  /* (%rax damaged by mstate call) */
      822 +
      823 +        ASSERT_LWPTOREGS(%r14, %rsp)
      824 +
       825 +        incq    %gs:CPU_STATS_SYS_SYSCALL
      826 +
      827 +        /*
      828 +         * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
      829 +         * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
      830 +         * more succinctly:
      831 +         *
      832 +         *      SA(MAXSYSARGS * sizeof (long)) == 64
      833 +         */
      834 +#define SYS_DROP        64                      /* drop for args */
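        /*
         * Worked out (a sketch; MAXSYSARGS and SA() come from the system
         * headers): 8 args * sizeof (long) = 8 * 8 = 64 bytes, and 64 is
         * already a multiple of the 16-byte stack alignment, so
         * SA(64) == 64, hence SYS_DROP.
         */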
      835 +        subq    $SYS_DROP, %rsp
      836 +        movb    $LWP_SYS, LWP_STATE(%r14)
      837 +        movq    %r15, %rdi
      838 +        movq    %rsp, %rsi
      839 +        call    syscall_entry
      840 +
      841 +        /*
      842 +         * Fetch the arguments copied onto the kernel stack and put
      843 +         * them in the right registers to invoke a C-style syscall handler.
      844 +         * %rax contains the handler address.
      845 +         *
      846 +         * Ideas for making all this go faster of course include simply
      847 +         * forcibly fetching 6 arguments from the user stack under lofault
      848 +         * protection, reverting to copyin_args only when watchpoints
      849 +         * are in effect.
      850 +         *
      851 +         * (If we do this, make sure that exec and libthread leave
      852 +         * enough space at the top of the stack to ensure that we'll
      853 +         * never do a fetch from an invalid page.)
      854 +         *
      855 +         * Lots of ideas here, but they won't really help with bringup B-)
      856 +         * Correctness can't wait, performance can wait a little longer ..
      857 +         */
      858 +
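        /*
         * The six arg slots filled in by syscall_entry() map, in order,
         * onto the amd64 C argument registers: args 0-5 go to %rdi, %rsi,
         * %rdx, %rcx, %r8 and %r9 (only the low 32 bits are significant
         * for a 32-bit syscall).
         */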
      859 +        movq    %rax, %rbx
      860 +        movl    0(%rsp), %edi
      861 +        movl    8(%rsp), %esi
      862 +        movl    0x10(%rsp), %edx
      863 +        movl    0x18(%rsp), %ecx
      864 +        movl    0x20(%rsp), %r8d
      865 +        movl    0x28(%rsp), %r9d
      866 +
      867 +        movq    SY_CALLC(%rbx), %rax
      868 +        INDIRECT_CALL_REG(rax)
      869 +
      870 +        movq    %rbp, %rsp      /* pop the args */
      871 +
      872 +        /*
      873 +         * amd64 syscall handlers -always- return a 64-bit value in %rax.
      874 +         * On the 32-bit kernel, they always return that value in %eax:%edx
      875 +         * as required by the 32-bit ABI.
      876 +         *
      877 +         * Simulate the same behaviour by unconditionally splitting the
      878 +         * return value in the same way.
      879 +         */
      880 +        movq    %rax, %r13
      881 +        shrq    $32, %r13       /* upper 32-bits into %edx */
      882 +        movl    %eax, %r12d     /* lower 32-bits into %eax */
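        /*
         * Illustrative values only: a handler returning the 64-bit value
         * 0x0000000100000002 in %rax reaches the 32-bit caller as
         * %edx = 0x00000001 (rval2) and %eax = 0x00000002 (rval1), the
         * way the 32-bit ABI returns a 64-bit (long long) result.
         */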
      883 +
      884 +        /*
      885 +         * Optimistically assume that there's no post-syscall
      886 +         * work to do.  (This is to avoid having to call syscall_mstate()
      887 +         * with interrupts disabled)
      888 +         */
      889 +        MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
      890 +
      891 +        /*
      892 +         * We must protect ourselves from being descheduled here;
      893 +         * If we were, and we ended up on another cpu, or another
      894 +         * lwp got in ahead of us, it could change the segment
      895 +         * registers without us noticing before we return to userland.
      896 +         */
      897 +        CLI(%r14)
      898 +        CHECK_POSTSYS_NE(%r15, %r14, %ebx)
      899 +        jne     _full_syscall_postsys32
      900 +        SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
      901 +
      902 +        /*
      903 +         * Clobber %r11 as we check CR0.TS.
      904 +         */
      905 +        ASSERT_CR0TS_ZERO(%r11)
      906 +
      907 +        /*
      908 +         * Unlike other cases, because we need to restore the user stack pointer
      909 +         * before exiting the kernel we must clear the microarch state before
      910 +         * getting here. This should be safe because it means that the only
      911 +         * values on the bus after this are based on the user's registers and
      912 +         * potentially the addresses where we stored them. Given the constraints
      913 +         * of sysret, that's how it has to be.
      914 +         */
      915 +        call    x86_md_clear
      916 +
      917 +        /*
      918 +         * To get back to userland, we need to put the return %rip in %rcx and
      919 +         * the return %rfl in %r11d.  The sysret instruction also arranges
      920 +         * to fix up %cs and %ss; everything else is our responsibility.
      921 +         */
      922 +
      923 +        movl    %r12d, %eax                     /* %eax: rval1 */
      924 +        movl    REGOFF_RBX(%rsp), %ebx
      925 +        /* %ecx used for return pointer */
      926 +        movl    %r13d, %edx                     /* %edx: rval2 */
      927 +        movl    REGOFF_RBP(%rsp), %ebp
      928 +        movl    REGOFF_RSI(%rsp), %esi
      929 +        movl    REGOFF_RDI(%rsp), %edi
      930 +
      931 +        movl    REGOFF_RFL(%rsp), %r11d         /* %r11 -> eflags */
      932 +        movl    REGOFF_RIP(%rsp), %ecx          /* %ecx -> %eip */
      933 +        movl    REGOFF_RSP(%rsp), %esp
      934 +
      935 +        ASSERT_UPCALL_MASK_IS_SET
      936 +        ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
      937 +        jmp     tr_sysretl
      938 +        SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
      939 +        /*NOTREACHED*/
      940 +
      941 +_full_syscall_postsys32:
      942 +        STI
      943 +        /*
      944 +         * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
      945 +         * so that we can account for the extra work it takes us to finish.
      946 +         */
      947 +        MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
      948 +        movq    %r15, %rdi
      949 +        movq    %r12, %rsi                      /* rval1 - %eax */
      950 +        movq    %r13, %rdx                      /* rval2 - %edx */
      951 +        call    syscall_exit
      952 +        MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
      953 +        jmp     _sys_rtt
      954 +        SET_SIZE(sys_syscall32)
      955 +        SET_SIZE(brand_sys_syscall32)
      956 +
 463  957  /*
 464  958   * System call handler via the sysenter instruction
      959 + * Used only for 32-bit system calls on the 64-bit kernel.
 465  960   *
 466      - * Here's how syscall entry usually works (see sys_call for details).
 467      - *
 468      - * There, the caller (lcall or int) in userland has arranged that:
 469      - *
 470      - * -    %eax contains the syscall number
 471      - * -    the user stack contains the args to the syscall
 472      - *
 473      - * Normally the lcall instruction into the call gate causes the processor
 474      - * to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
 475      - * The sys_call handler then leaves space for r_trapno and r_err, and
 476      - * pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
 477      - * by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
 478      - * Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
 479      - * extracts %efl and puts it into r_efl (which happens to live at the offset
 480      - * that <top-of-stack> was copied into). Note that the value in r_efl has
 481      - * the IF (interrupt enable) flag turned on. (The int instruction into the
 482      - * interrupt gate does essentially the same thing, only instead of
 483      - * <top-of-stack> we get eflags - see comment above.)
 484      - *
 485      - * In the sysenter case, things are a lot more primitive.
 486      - *
 487  961   * The caller in userland has arranged that:
 488  962   *
 489  963   * -    %eax contains the syscall number
 490  964   * -    %ecx contains the user %esp
 491  965   * -    %edx contains the return %eip
 492  966   * -    the user stack contains the args to the syscall
 493  967   *
 494      - * e.g.
 495      - *      <args on the stack>
 496      - *      mov     $SYS_callnum, %eax
 497      - *      mov     $1f, %edx       / return %eip
 498      - *      mov     %esp, %ecx      / return %esp
 499      - *      sysenter
 500      - * 1:
 501      - *
 502  968   * Hardware and (privileged) initialization code have arranged that by
  503  969   * the time the sysenter instruction completes:
 504  970   *
 505      - * - %eip is pointing to sys_sysenter (below).
      971 + * - %rip is pointing to sys_sysenter (below).
 506  972   * - %cs and %ss are set to kernel text and stack (data) selectors.
 507      - * - %esp is pointing at the lwp's stack
 508      - * - Interrupts have been disabled.
      973 + * - %rsp is pointing at the lwp's stack
      974 + * - interrupts have been disabled.
 509  975   *
 510      - * The task for the sysenter handler is:
      976 + * Note that we are unable to return both "rvals" to userland with
      977 + * this call, as %edx is used by the sysexit instruction.
 511  978   *
 512      - * -    recreate the same regs structure on the stack and the same
 513      - *      kernel state as if we'd come in on an lcall
 514      - * -    do the normal work of a syscall
 515      - * -    execute the system call epilogue, use sysexit to return to userland.
 516      - *
 517      - * Note that we are unable to return both "rvals" to userland with this
 518      - * call, as %edx is used by the sysexit instruction.
 519      - *
 520  979   * One final complication in this routine is its interaction with
 521      - * single-stepping in a debugger.  For most of the system call mechanisms,
 522      - * the CPU automatically clears the single-step flag before we enter the
 523      - * kernel.  The sysenter mechanism does not clear the flag, so a user
 524      - * single-stepping through a libc routine may suddenly find themself
 525      - * single-stepping through the kernel.  To detect this, kmdb compares the
 526      - * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
 527      - * If it finds that we have single-stepped to a sysenter entry point, it
 528      - * explicitly clears the flag and executes the sys_sysenter routine.
      980 + * single-stepping in a debugger.  For most of the system call mechanisms, the
      981 + * CPU automatically clears the single-step flag before we enter the kernel.
      982 + * The sysenter mechanism does not clear the flag, so a user single-stepping
       983 + * through a libc routine may suddenly find themselves single-stepping through the
      984 + * kernel.  To detect this, kmdb and trap() both compare the trap %pc to the
      985 + * [brand_]sys_enter addresses on each single-step trap.  If it finds that we
      986 + * have single-stepped to a sysenter entry point, it explicitly clears the flag
      987 + * and executes the sys_sysenter routine.
 529  988   *
 530      - * One final complication in this final complication is the fact that we
 531      - * have two different entry points for sysenter: brand_sys_sysenter and
 532      - * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 533      - * through the kernel with kmdb, we will eventually hit the instruction at
 534      - * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 535      - * and the undesirable one mentioned above.  To avoid this situation, we
 536      - * simply add a jump over the instruction at sys_sysenter to make it
 537      - * impossible to single-step to it.
      989 + * One final complication in this final complication is the fact that we have
      990 + * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
      991 + * If we enter at brand_sys_sysenter and start single-stepping through the
      992 + * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
      993 + * kmdb cannot distinguish between that valid single-step and the undesirable
      994 + * one mentioned above.  To avoid this situation, we simply add a jump over the
      995 + * instruction at sys_sysenter to make it impossible to single-step to it.
 538  996   */
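/*
 * For reference, a minimal sketch of the userland side of this contract
 * (SYS_callnum stands in for a real system call number; this mirrors the
 * convention described above rather than any particular libc wrapper):
 *
 *      <args on the user stack>
 *      mov     $SYS_callnum, %eax
 *      mov     $1f, %edx       / return %eip
 *      mov     %esp, %ecx      / return %esp
 *      sysenter
 * 1:
 */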
 539      -#if defined(__lint)
 540  997  
 541      -void
 542      -sys_sysenter()
 543      -{}
 544      -
 545      -#else   /* __lint */
 546      -
 547  998          ENTRY_NP(brand_sys_sysenter)
 548      -        pushl   %edx
 549      -        BRAND_CALLBACK(BRAND_CB_SYSENTER)
 550      -        popl    %edx
      999 +        SWAPGS                          /* kernel gsbase */
     1000 +        ALTENTRY(_brand_sys_sysenter_post_swapgs)
     1001 +
     1002 +        BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
 551 1003          /*
 552 1004           * Jump over sys_sysenter to allow single-stepping as described
 553 1005           * above.
 554 1006           */
 555      -        ja      1f
     1007 +        jmp     _sys_sysenter_post_swapgs
 556 1008  
 557 1009          ALTENTRY(sys_sysenter)
 558      -        nop
 559      -1:
 560      -        /
 561      -        / do what the call gate would've done to the stack ..
 562      -        /
 563      -        pushl   $UDS_SEL        / (really %ss, but it's the same ..)
 564      -        pushl   %ecx            / userland makes this a copy of %esp
 565      -        pushfl
 566      -        orl     $PS_IE, (%esp)  / turn interrupts on when we return to user
 567      -        pushl   $UCS_SEL
 568      -        pushl   %edx            / userland makes this a copy of %eip
 569      -        /
 570      -        / done.  finish building the stack frame
 571      -        /
 572      -        subl    $8, %esp        / leave space for ERR and TRAPNO
     1010 +        SWAPGS                          /* kernel gsbase */
     1011 +        ALTENTRY(_sys_sysenter_post_swapgs)
 573 1012  
 574      -        SYSENTER_PUSH
     1013 +        movq    %gs:CPU_THREAD, %r15
 575 1014  
 576      -#ifdef TRAPTRACE
 577      -        TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER) / uses labels 8 and 9
 578      -        TRACE_REGS(%edi, %esp, %ebx, %ecx)              / uses label 9
 579      -        pushl   %eax
 580      -        TRACE_STAMP(%edi)               / clobbers %eax, %edx, uses label 9
 581      -        popl    %eax
 582      -        movl    %eax, TTR_SYSNUM(%edi)
     1015 +        movl    $U32CS_SEL, REGOFF_CS(%rsp)
     1016 +        movl    %ecx, REGOFF_RSP(%rsp)          /* wrapper: %esp -> %ecx */
     1017 +        movl    %edx, REGOFF_RIP(%rsp)          /* wrapper: %eip -> %edx */
     1018 +        /*
     1019 +         * NOTE: none of the instructions that run before we get here should
     1020 +         * clobber bits in (R)FLAGS! This includes the kpti trampoline.
     1021 +         */
     1022 +        pushfq
     1023 +        popq    %r10
     1024 +        movl    $UDS_SEL, REGOFF_SS(%rsp)
     1025 +
     1026 +        /*
     1027 +         * Set the interrupt flag before storing the flags to the
     1028 +         * flags image on the stack so we can return to user with
     1029 +         * interrupts enabled if we return via sys_rtt_syscall32
     1030 +         */
     1031 +        orq     $PS_IE, %r10
     1032 +        movq    %r10, REGOFF_RFL(%rsp)
     1033 +
     1034 +        movl    %edi, REGOFF_RDI(%rsp)
     1035 +        movl    %esi, REGOFF_RSI(%rsp)
     1036 +        movl    %ebp, REGOFF_RBP(%rsp)
     1037 +        movl    %ebx, REGOFF_RBX(%rsp)
     1038 +        movl    %edx, REGOFF_RDX(%rsp)
     1039 +        movl    %ecx, REGOFF_RCX(%rsp)
     1040 +        movl    %eax, REGOFF_RAX(%rsp)          /* wrapper: sysc# -> %eax */
     1041 +        movq    $0, REGOFF_SAVFP(%rsp)
     1042 +        movq    $0, REGOFF_SAVPC(%rsp)
     1043 +
     1044 +        /*
     1045 +         * Copy these registers here in case we end up stopped with
     1046 +         * someone (like, say, /proc) messing with our register state.
     1047 +         * We don't -restore- them unless we have to in update_sregs.
     1048 +         *
     1049 +         * Since userland -can't- change fsbase or gsbase directly,
     1050 +         * we don't bother to capture them here.
     1051 +         */
     1052 +        xorl    %ebx, %ebx
     1053 +        movw    %ds, %bx
     1054 +        movq    %rbx, REGOFF_DS(%rsp)
     1055 +        movw    %es, %bx
     1056 +        movq    %rbx, REGOFF_ES(%rsp)
     1057 +        movw    %fs, %bx
     1058 +        movq    %rbx, REGOFF_FS(%rsp)
     1059 +        movw    %gs, %bx
     1060 +        movq    %rbx, REGOFF_GS(%rsp)
     1061 +
     1062 +        /*
     1063 +         * If we're trying to use TRAPTRACE though, I take that back: we're
     1064 +         * probably debugging some problem in the SWAPGS logic and want to know
     1065 +         * what the incoming gsbase was.
     1066 +         *
     1067 +         * Since we already did SWAPGS, record the KGSBASE.
     1068 +         */
     1069 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
     1070 +        movl    $MSR_AMD_KGSBASE, %ecx
     1071 +        rdmsr
     1072 +        movl    %eax, REGOFF_GSBASE(%rsp)
     1073 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
 583 1074  #endif
 584      -        movl    %esp, %ebp
 585 1075  
 586      -        CPU_STATS_SYS_SYSCALL_INC
     1076 +        /*
     1077 +         * Application state saved in the regs structure on the stack
     1078 +         * %eax is the syscall number
     1079 +         * %rsp is the thread's stack, %r15 is curthread
     1080 +         * REG_RSP(%rsp) is the user's stack
     1081 +         */
 587 1082  
     1083 +        SYSCALL_TRAPTRACE($TT_SYSENTER)
     1084 +
     1085 +        movq    %rsp, %rbp
     1086 +
     1087 +        movq    T_LWP(%r15), %r14
     1088 +        ASSERT_NO_RUPDATE_PENDING(%r14)
     1089 +
 588 1090          ENABLE_INTR_FLAGS
 589 1091  
 590      -        pushl   %eax                            / preserve across mstate call
     1092 +        /*
      1093 +         * Catch a 64-bit process trying to issue the sysenter instruction
      1094 +         * on Nocona-based systems.
     1095 +         */
     1096 +        movq    LWP_PROCP(%r14), %rax
     1097 +        cmpq    $DATAMODEL_ILP32, P_MODEL(%rax)
     1098 +        je      7f
     1099 +
     1100 +        /*
     1101 +         * For a non-32-bit process, simulate a #ud, since that's what
     1102 +         * native hardware does.  The traptrace entry (above) will
     1103 +         * let you know what really happened.
     1104 +         */
     1105 +        movq    $T_ILLINST, REGOFF_TRAPNO(%rsp)
     1106 +        movq    REGOFF_CS(%rsp), %rdi
     1107 +        movq    %rdi, REGOFF_ERR(%rsp)
     1108 +        movq    %rsp, %rdi
     1109 +        movq    REGOFF_RIP(%rsp), %rsi
     1110 +        movl    %gs:CPU_ID, %edx
     1111 +        call    trap
     1112 +        jmp     _sys_rtt
     1113 +7:
     1114 +
 591 1115          MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 592      -        popl    %eax
     1116 +        movl    REGOFF_RAX(%rsp), %eax  /* (%rax damaged by mstate calls) */
 593 1117  
 594      -        movl    %gs:CPU_THREAD, %ebx
     1118 +        ASSERT_LWPTOREGS(%r14, %rsp)
 595 1119  
 596      -        ASSERT_LWPTOREGS(%ebx, %esp)
     1120 +        incq    %gs:CPU_STATS_SYS_SYSCALL
 597 1121  
 598      -        CHECK_PRESYS_NE(%ebx, %eax)
 599      -        jne     _full_syscall_presys
 600      -        SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
     1122 +        /*
     1123 +         * Make some space for MAXSYSARGS (currently 8) 32-bit args
      1124 +         * placed into 64-bit (long) arg slots, maintaining 16 byte
      1125 +         * alignment (the same SYS_DROP sizing as above).
     1126 +         */
     1127 +        subq    $SYS_DROP, %rsp
     1128 +        movb    $LWP_SYS, LWP_STATE(%r14)
     1129 +        movq    %r15, %rdi
     1130 +        movq    %rsp, %rsi
     1131 +        call    syscall_entry
 601 1132  
 602      -_sysenter_call:
 603      -        call    *SY_CALLC(%eax)
     1133 +        /*
     1134 +         * Fetch the arguments copied onto the kernel stack and put
     1135 +         * them in the right registers to invoke a C-style syscall handler.
     1136 +         * %rax contains the handler address.
     1137 +         */
     1138 +        movq    %rax, %rbx
     1139 +        movl    0(%rsp), %edi
     1140 +        movl    8(%rsp), %esi
     1141 +        movl    0x10(%rsp), %edx
     1142 +        movl    0x18(%rsp), %ecx
     1143 +        movl    0x20(%rsp), %r8d
     1144 +        movl    0x28(%rsp), %r9d
 604 1145  
 605      -_sysenter_done:
 606      -        CHECK_POSTSYS_NE(%ebx, %ecx)
 607      -        jne     _full_syscall_postsys
 608      -        SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
 609      -        /
 610      -        / sysexit uses %edx to restore %eip, so we can't use it
 611      -        / to return a value, sigh.
 612      -        /
 613      -        movl    %eax, REGOFF_EAX(%esp)
 614      -        / movl  %edx, REGOFF_EDX(%esp)
     1146 +        movq    SY_CALLC(%rbx), %rax
     1147 +        INDIRECT_CALL_REG(rax)
 615 1148  
 616      -        / Interrupts will be turned on by the 'sti' executed just before
 617      -        / sysexit. The following ensures that restoring the user's EFLAGS
 618      -        / doesn't enable interrupts too soon.
 619      -        andl    $_BITNOT(PS_IE), REGOFF_EFL(%esp)
     1149 +        movq    %rbp, %rsp      /* pop the args */
 620 1150  
     1151 +        /*
     1152 +         * amd64 syscall handlers -always- return a 64-bit value in %rax.
      1153 +         * On the 32-bit kernel, they always return that value in %eax:%edx
     1154 +         * as required by the 32-bit ABI.
     1155 +         *
     1156 +         * Simulate the same behaviour by unconditionally splitting the
     1157 +         * return value in the same way.
     1158 +         */
     1159 +        movq    %rax, %r13
     1160 +        shrq    $32, %r13       /* upper 32-bits into %edx */
     1161 +        movl    %eax, %r12d     /* lower 32-bits into %eax */
     1162 +
     1163 +        /*
     1164 +         * Optimistically assume that there's no post-syscall
     1165 +         * work to do.  (This is to avoid having to call syscall_mstate()
     1166 +         * with interrupts disabled)
     1167 +         */
 621 1168          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 622 1169  
     1170 +        /*
     1171 +         * We must protect ourselves from being descheduled here;
     1172 +         * If we were, and we ended up on another cpu, or another
      1173 +         * lwp got in ahead of us, it could change the segment
     1174 +         * registers without us noticing before we return to userland.
     1175 +         *
     1176 +         * This cli is undone in the tr_sysexit trampoline code.
     1177 +         */
 623 1178          cli
     1179 +        CHECK_POSTSYS_NE(%r15, %r14, %ebx)
     1180 +        jne     _full_syscall_postsys32
     1181 +        SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
 624 1182  
 625      -        SYSCALL_POP
     1183 +        /*
     1184 +         * To get back to userland, load up the 32-bit registers and
     1185 +         * sysexit back where we came from.
     1186 +         */
 626 1187  
 627      -        popl    %edx                    / sysexit: %edx -> %eip
 628      -        addl    $4, %esp                / get CS off the stack
 629      -        popfl                           / EFL
 630      -        popl    %ecx                    / sysexit: %ecx -> %esp
 631      -        sti
 632      -        sysexit
     1188 +        /*
     1189 +         * Interrupts will be turned on by the 'sti' executed just before
     1190 +         * sysexit.  The following ensures that restoring the user's rflags
     1191 +         * doesn't enable interrupts too soon.
     1192 +         */
     1193 +        andq    $_BITNOT(PS_IE), REGOFF_RFL(%rsp)
     1194 +
     1195 +        /*
     1196 +         * Clobber %r11 as we check CR0.TS.
     1197 +         */
     1198 +        ASSERT_CR0TS_ZERO(%r11)
     1199 +
     1200 +        /*
     1201 +         * (There's no point in loading up %edx because the sysexit
     1202 +         * mechanism smashes it.)
     1203 +         */
     1204 +        movl    %r12d, %eax
     1205 +        movl    REGOFF_RBX(%rsp), %ebx
     1206 +        movl    REGOFF_RBP(%rsp), %ebp
     1207 +        movl    REGOFF_RSI(%rsp), %esi
     1208 +        movl    REGOFF_RDI(%rsp), %edi
     1209 +
     1210 +        movl    REGOFF_RIP(%rsp), %edx  /* sysexit: %edx -> %eip */
     1211 +        pushq   REGOFF_RFL(%rsp)
     1212 +        popfq
     1213 +        movl    REGOFF_RSP(%rsp), %ecx  /* sysexit: %ecx -> %esp */
     1214 +        ALTENTRY(sys_sysenter_swapgs_sysexit)
     1215 +        call    x86_md_clear
     1216 +        jmp     tr_sysexit
     1217 +        SET_SIZE(sys_sysenter_swapgs_sysexit)
 633 1218          SET_SIZE(sys_sysenter)
     1219 +        SET_SIZE(_sys_sysenter_post_swapgs)
 634 1220          SET_SIZE(brand_sys_sysenter)
 635 1221  
 636 1222  /*
     1223 + * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
     1224 + * the generic i386 libc to do system calls. We do a small amount of setup
     1225 + * before jumping into the existing sys_syscall32 path.
     1226 + */
     1227 +
     1228 +        ENTRY_NP(brand_sys_syscall_int)
     1229 +        SWAPGS                          /* kernel gsbase */
     1230 +        XPV_TRAP_POP
     1231 +        call    smap_enable
     1232 +        BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
     1233 +        jmp     nopop_syscall_int
     1234 +
     1235 +        ALTENTRY(sys_syscall_int)
     1236 +        SWAPGS                          /* kernel gsbase */
     1237 +        XPV_TRAP_POP
     1238 +        call    smap_enable
     1239 +
     1240 +nopop_syscall_int:
     1241 +        movq    %gs:CPU_THREAD, %r15
     1242 +        movq    T_STACK(%r15), %rsp
     1243 +        movl    %eax, %eax
     1244 +        /*
     1245 +         * Set t_post_sys on this thread to force ourselves out via the slow
     1246 +         * path. It might be possible at some later date to optimize this out
     1247 +         * and use a faster return mechanism.
     1248 +         */
     1249 +        movb    $1, T_POST_SYS(%r15)
     1250 +        CLEAN_CS
     1251 +        jmp     _syscall32_save
     1252 +        /*
     1253 +         * There should be no instructions between this label and SWAPGS/IRET
     1254 +         * or we could end up breaking branded zone support. See the usage of
     1255 +         * this label in lx_brand_int80_callback and sn1_brand_int91_callback
     1256 +         * for examples.
     1257 +         *
     1258 +         * We want to swapgs to maintain the invariant that all entries into
     1259 +         * tr_iret_user are done on the user gsbase.
     1260 +         */
     1261 +        ALTENTRY(sys_sysint_swapgs_iret)
     1262 +        call    x86_md_clear
     1263 +        SWAPGS
     1264 +        jmp     tr_iret_user
     1265 +        /*NOTREACHED*/
     1266 +        SET_SIZE(sys_sysint_swapgs_iret)
     1267 +        SET_SIZE(sys_syscall_int)
     1268 +        SET_SIZE(brand_sys_syscall_int)
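/*
 * For reference, a rough sketch of the userland side of this entry point
 * (SYS_callnum stands in for a real system call number; the convention of
 * syscall number in %eax and arguments on the user stack is the generic
 * i386 contract described above):
 *
 *      <args on the user stack>
 *      mov     $SYS_callnum, %eax
 *      int     $T_SYSCALLINT
 *      / results come back per the usual rval1 (%eax) / rval2 (%edx)
 *      / convention used elsewhere in this handler
 */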
     1269 +
     1270 +/*
     1271 + * Legacy 32-bit applications and old libc implementations do lcalls;
     1272 + * we should never get here because the LDT entry containing the syscall
     1273 + * segment descriptor has the "segment present" bit cleared, which means
     1274 + * we end up processing those system calls in trap() via a not-present trap.
     1275 + *
     1276 + * We do it this way because a call gate unhelpfully does -nothing- to the
     1277 + * interrupt flag bit, so an interrupt can run us just after the lcall
     1278 + * completes, but just before the swapgs takes effect.   Thus the INTR_PUSH and
     1279 + * INTR_POP paths would have to be slightly more complex to dance around
     1280 + * this problem, and end up depending explicitly on the first
     1281 + * instruction of this handler being either swapgs or cli.
     1282 + */
     1283 +
     1284 +        ENTRY_NP(sys_lcall32)
     1285 +        SWAPGS                          /* kernel gsbase */
     1286 +        pushq   $0
     1287 +        pushq   %rbp
     1288 +        movq    %rsp, %rbp
     1289 +        leaq    __lcall_panic_str(%rip), %rdi
     1290 +        xorl    %eax, %eax
     1291 +        call    panic
     1292 +        SET_SIZE(sys_lcall32)
     1293 +
     1294 +__lcall_panic_str:
     1295 +        .string "sys_lcall32: shouldn't be here!"
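/*
 * For reference, a legacy caller would have reached this point with
 * something along the lines of the following (the selector/offset pair is
 * illustrative of the old call-gate convention; SYS_callnum stands in for
 * a real system call number):
 *
 *      <args on the user stack>
 *      mov     $SYS_callnum, %eax
 *      lcall   $0x7, $0        / far call through the system call gate
 */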
     1296 +
     1297 +/*
 637 1298   * Declare a uintptr_t which covers the entire pc range of syscall
 638 1299   * handlers for the stack walkers that need this.
 639 1300   */
 640 1301          .align  CPTRSIZE
 641 1302          .globl  _allsyscalls_size
 642 1303          .type   _allsyscalls_size, @object
 643 1304  _allsyscalls_size:
 644 1305          .NWORD  . - _allsyscalls
 645 1306          SET_SIZE(_allsyscalls_size)
 646 1307  
 647      -#endif  /* __lint */
 648      -
 649 1308  /*
 650 1309   * These are the thread context handlers for lwps using sysenter/sysexit.
 651 1310   */
 652 1311  
 653      -#if defined(__lint)
 654      -
 655      -/*ARGSUSED*/
 656      -void
 657      -sep_save(void *ksp)
 658      -{}
 659      -
 660      -/*ARGSUSED*/
 661      -void
 662      -sep_restore(void *ksp)
 663      -{}
 664      -
 665      -#else   /* __lint */
 666      -
 667 1312          /*
 668 1313           * setting this value to zero as we switch away causes the
 669 1314           * stack-pointer-on-sysenter to be NULL, ensuring that we
 670 1315           * don't silently corrupt another (preempted) thread stack
 671 1316           * when running an lwp that (somehow) didn't get sep_restore'd
 672 1317           */
 673 1318          ENTRY_NP(sep_save)
 674 1319          xorl    %edx, %edx
 675 1320          xorl    %eax, %eax
 676 1321          movl    $MSR_INTC_SEP_ESP, %ecx
 677 1322          wrmsr
 678 1323          ret
 679 1324          SET_SIZE(sep_save)
 680 1325  
 681 1326          /*
 682 1327           * Update the kernel stack pointer as we resume onto this cpu.
 683 1328           */
 684 1329          ENTRY_NP(sep_restore)
 685      -        movl    4(%esp), %eax                   /* per-lwp kernel sp */
 686      -        xorl    %edx, %edx
      1330 +        movq    %rdi, %rdx              /* %rdi: per-lwp kernel sp */
      1331 +        shrq    $32, %rdx               /* high 32 bits -> %edx for wrmsr */
      1332 +        movl    %edi, %eax              /* low 32 bits -> %eax for wrmsr */
 687 1333          movl    $MSR_INTC_SEP_ESP, %ecx
 688 1334          wrmsr
 689 1335          ret
 690 1336          SET_SIZE(sep_restore)
 691 1337  
 692      -#endif  /* __lint */
 693      -
 694      -/*
 695      - * Call syscall().  Called from trap() on watchpoint at lcall 0,7
 696      - */
 697      -
 698      -#if defined(__lint)
 699      -
 700      -void
 701      -watch_syscall(void)
 702      -{}
 703      -
 704      -#else   /* __lint */
 705      -
 706      -        ENTRY_NP(watch_syscall)
 707      -        CLI(%eax)
 708      -        movl    %gs:CPU_THREAD, %ebx
 709      -        movl    T_STACK(%ebx), %esp             / switch to the thread stack
 710      -        movl    REGOFF_EAX(%esp), %eax          / recover original syscall#
 711      -        jmp     _watch_do_syscall
 712      -        SET_SIZE(watch_syscall)
 713      -
 714      -#endif  /* __lint */
    