Print this page
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
          +++ new/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright 2015 Joyent, Inc.
       23 + * Copyright 2018 Joyent, Inc.
  24   24   * Copyright (c) 2016 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/asm_linkage.h>
  28   28  #include <sys/asm_misc.h>
  29   29  #include <sys/regset.h>
  30   30  #include <sys/privregs.h>
  31   31  #include <sys/psw.h>
  32   32  #include <sys/machbrand.h>
  33   33  
↓ open down ↓ 450 lines elided ↑ open up ↑
 484  484          movw    %ds, %bx
 485  485          movq    %rbx, REGOFF_DS(%rsp)
 486  486          movw    %es, %bx
 487  487          movq    %rbx, REGOFF_ES(%rsp)
 488  488          movw    %fs, %bx
 489  489          movq    %rbx, REGOFF_FS(%rsp)
 490  490          movw    %gs, %bx
 491  491          movq    %rbx, REGOFF_GS(%rsp)
 492  492  
 493  493          /*
      494 +         * If we're trying to use TRAPTRACE though, I take that back: we're
      495 +         * probably debugging some problem in the SWAPGS logic and want to know
      496 +         * what the incoming gsbase was.
      497 +         *
      498 +         * Since we already did SWAPGS, record the KGSBASE.
      499 +         */
      500 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
      501 +        movl    $MSR_AMD_KGSBASE, %ecx
      502 +        rdmsr
      503 +        movl    %eax, REGOFF_GSBASE(%rsp)
      504 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
      505 +#endif
      506 +
      507 +        /*
 494  508           * Machine state saved in the regs structure on the stack
 495  509           * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
 496  510           * %eax is the syscall number
 497  511           * %rsp is the thread's stack, %r15 is curthread
 498  512           * REG_RSP(%rsp) is the user's stack
 499  513           */
 500  514  
 501  515          SYSCALL_TRAPTRACE($TT_SYSC64)
 502  516  
 503  517          movq    %rsp, %rbp
↓ open down ↓ 160 lines elided ↑ open up ↑
 664  678          movl    $UDS_SEL, REGOFF_SS(%rsp)
 665  679          addq    $REGOFF_RIP, %rsp
 666  680          /*
 667  681           * XXPV: see comment in SYSRETQ definition for future optimization
 668  682           *       we could take.
 669  683           */
 670  684          ASSERT_UPCALL_MASK_IS_SET
 671  685          SYSRETQ
 672  686  #else
 673  687          ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
 674      -        SWAPGS                          /* user gsbase */
 675      -        SYSRETQ
      688 +        jmp     tr_sysretq
 676  689  #endif
 677  690          /*NOTREACHED*/
 678  691          SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
 679  692  
 680  693  _syscall_pre:
 681  694          call    pre_syscall
 682  695          movl    %eax, %r12d
 683  696          testl   %eax, %eax
 684  697          jne     _syscall_post_call
 685  698          /*
↓ open down ↓ 80 lines elided ↑ open up ↑
 766  779          movw    %ds, %bx
 767  780          movq    %rbx, REGOFF_DS(%rsp)
 768  781          movw    %es, %bx
 769  782          movq    %rbx, REGOFF_ES(%rsp)
 770  783          movw    %fs, %bx
 771  784          movq    %rbx, REGOFF_FS(%rsp)
 772  785          movw    %gs, %bx
 773  786          movq    %rbx, REGOFF_GS(%rsp)
 774  787  
 775  788          /*
      789 +         * If we're trying to use TRAPTRACE though, I take that back: we're
      790 +         * probably debugging some problem in the SWAPGS logic and want to know
      791 +         * what the incoming gsbase was.
      792 +         *
      793 +         * Since we already did SWAPGS, record the KGSBASE.
      794 +         */
      795 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
      796 +        movl    $MSR_AMD_KGSBASE, %ecx
      797 +        rdmsr
      798 +        movl    %eax, REGOFF_GSBASE(%rsp)
      799 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
      800 +#endif
      801 +
      802 +        /*
 776  803           * Application state saved in the regs structure on the stack
 777  804           * %eax is the syscall number
 778  805           * %rsp is the thread's stack, %r15 is curthread
 779  806           * REG_RSP(%rsp) is the user's stack
 780  807           */
 781  808  
 782  809          SYSCALL_TRAPTRACE32($TT_SYSC)
 783  810  
 784  811          movq    %rsp, %rbp
 785  812  
↓ open down ↓ 96 lines elided ↑ open up ↑
 882  909          movl    REGOFF_RBP(%rsp), %ebp
 883  910          movl    REGOFF_RSI(%rsp), %esi
 884  911          movl    REGOFF_RDI(%rsp), %edi
 885  912  
 886  913          movl    REGOFF_RFL(%rsp), %r11d         /* %r11 -> eflags */
 887  914          movl    REGOFF_RIP(%rsp), %ecx          /* %ecx -> %eip */
 888  915          movl    REGOFF_RSP(%rsp), %esp
 889  916  
 890  917          ASSERT_UPCALL_MASK_IS_SET
 891  918          ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
 892      -        SWAPGS                          /* user gsbase */
 893      -        SYSRETL
      919 +        jmp     tr_sysretl
 894  920          SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
 895  921          /*NOTREACHED*/
 896  922  
 897  923  _full_syscall_postsys32:
 898  924          STI
 899  925          /*
 900  926           * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
 901  927           * so that we can account for the extra work it takes us to finish.
 902  928           */
 903  929          MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
↓ open down ↓ 24 lines elided ↑ open up ↑
 928  954   *
 929  955   * - %rip is pointing to sys_sysenter (below).
 930  956   * - %cs and %ss are set to kernel text and stack (data) selectors.
 931  957   * - %rsp is pointing at the lwp's stack
 932  958   * - interrupts have been disabled.
 933  959   *
 934  960   * Note that we are unable to return both "rvals" to userland with
 935  961   * this call, as %edx is used by the sysexit instruction.
 936  962   *
 937  963   * One final complication in this routine is its interaction with
 938      - * single-stepping in a debugger.  For most of the system call mechanisms,
 939      - * the CPU automatically clears the single-step flag before we enter the
 940      - * kernel.  The sysenter mechanism does not clear the flag, so a user
 941      - * single-stepping through a libc routine may suddenly find themself
 942      - * single-stepping through the kernel.  To detect this, kmdb compares the
 943      - * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
 944      - * If it finds that we have single-stepped to a sysenter entry point, it
 945      - * explicitly clears the flag and executes the sys_sysenter routine.
      964 + * single-stepping in a debugger.  For most of the system call mechanisms, the
      965 + * CPU automatically clears the single-step flag before we enter the kernel.
      966 + * The sysenter mechanism does not clear the flag, so a user single-stepping
       967 + * through a libc routine may suddenly find themselves single-stepping through the
      968 + * kernel.  To detect this, kmdb and trap() both compare the trap %pc to the
      969 + * [brand_]sys_enter addresses on each single-step trap.  If it finds that we
      970 + * have single-stepped to a sysenter entry point, it explicitly clears the flag
      971 + * and executes the sys_sysenter routine.
 946  972   *
 947      - * One final complication in this final complication is the fact that we
 948      - * have two different entry points for sysenter: brand_sys_sysenter and
 949      - * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 950      - * through the kernel with kmdb, we will eventually hit the instruction at
 951      - * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 952      - * and the undesirable one mentioned above.  To avoid this situation, we
 953      - * simply add a jump over the instruction at sys_sysenter to make it
 954      - * impossible to single-step to it.
      973 + * One final complication in this final complication is the fact that we have
      974 + * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
      975 + * If we enter at brand_sys_sysenter and start single-stepping through the
      976 + * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
      977 + * kmdb cannot distinguish between that valid single-step and the undesirable
      978 + * one mentioned above.  To avoid this situation, we simply add a jump over the
      979 + * instruction at sys_sysenter to make it impossible to single-step to it.
 955  980   */
 956  981  #if defined(__lint)
 957  982  
 958  983  void
 959  984  sys_sysenter()
 960  985  {}
 961  986  
 962  987  #else   /* __lint */
 963  988  
 964  989          ENTRY_NP(brand_sys_sysenter)
 965  990          SWAPGS                          /* kernel gsbase */
 966  991          ALTENTRY(_brand_sys_sysenter_post_swapgs)
      992 +
 967  993          BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
 968  994          /*
 969  995           * Jump over sys_sysenter to allow single-stepping as described
 970  996           * above.
 971  997           */
 972  998          jmp     _sys_sysenter_post_swapgs
 973  999  
 974 1000          ALTENTRY(sys_sysenter)
 975 1001          SWAPGS                          /* kernel gsbase */
 976      -
 977 1002          ALTENTRY(_sys_sysenter_post_swapgs)
     1003 +
 978 1004          movq    %gs:CPU_THREAD, %r15
 979 1005  
 980 1006          movl    $U32CS_SEL, REGOFF_CS(%rsp)
 981 1007          movl    %ecx, REGOFF_RSP(%rsp)          /* wrapper: %esp -> %ecx */
 982 1008          movl    %edx, REGOFF_RIP(%rsp)          /* wrapper: %eip -> %edx */
     1009 +        /*
     1010 +         * NOTE: none of the instructions that run before we get here should
     1011 +         * clobber bits in (R)FLAGS! This includes the kpti trampoline.
     1012 +         */
 983 1013          pushfq
 984 1014          popq    %r10
 985 1015          movl    $UDS_SEL, REGOFF_SS(%rsp)
 986 1016  
 987 1017          /*
 988 1018           * Set the interrupt flag before storing the flags to the
 989 1019           * flags image on the stack so we can return to user with
 990 1020           * interrupts enabled if we return via sys_rtt_syscall32
 991 1021           */
 992 1022          orq     $PS_IE, %r10
↓ open down ↓ 21 lines elided ↑ open up ↑
1014 1044          movw    %ds, %bx
1015 1045          movq    %rbx, REGOFF_DS(%rsp)
1016 1046          movw    %es, %bx
1017 1047          movq    %rbx, REGOFF_ES(%rsp)
1018 1048          movw    %fs, %bx
1019 1049          movq    %rbx, REGOFF_FS(%rsp)
1020 1050          movw    %gs, %bx
1021 1051          movq    %rbx, REGOFF_GS(%rsp)
1022 1052  
1023 1053          /*
     1054 +         * If we're trying to use TRAPTRACE though, I take that back: we're
     1055 +         * probably debugging some problem in the SWAPGS logic and want to know
     1056 +         * what the incoming gsbase was.
     1057 +         *
     1058 +         * Since we already did SWAPGS, record the KGSBASE.
     1059 +         */
     1060 +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
     1061 +        movl    $MSR_AMD_KGSBASE, %ecx
     1062 +        rdmsr
     1063 +        movl    %eax, REGOFF_GSBASE(%rsp)
     1064 +        movl    %edx, REGOFF_GSBASE+4(%rsp)
     1065 +#endif
     1066 +
     1067 +        /*
1024 1068           * Application state saved in the regs structure on the stack
1025 1069           * %eax is the syscall number
1026 1070           * %rsp is the thread's stack, %r15 is curthread
1027 1071           * REG_RSP(%rsp) is the user's stack
1028 1072           */
1029 1073  
1030 1074          SYSCALL_TRAPTRACE($TT_SYSENTER)
1031 1075  
1032 1076          movq    %rsp, %rbp
1033 1077  
↓ open down ↓ 77 lines elided ↑ open up ↑
1111 1155           * work to do.  (This is to avoid having to call syscall_mstate()
1112 1156           * with interrupts disabled)
1113 1157           */
1114 1158          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
1115 1159  
1116 1160          /*
1117 1161           * We must protect ourselves from being descheduled here;
1118 1162           * If we were, and we ended up on another cpu, or another
 1119 1163          * lwp got in ahead of us, it could change the segment
1120 1164           * registers without us noticing before we return to userland.
     1165 +         *
     1166 +         * This cli is undone in the tr_sysexit trampoline code.
1121 1167           */
1122 1168          cli
1123 1169          CHECK_POSTSYS_NE(%r15, %r14, %ebx)
1124 1170          jne     _full_syscall_postsys32
1125 1171          SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
1126 1172  
1127 1173          /*
1128 1174           * To get back to userland, load up the 32-bit registers and
1129 1175           * sysexit back where we came from.
1130 1176           */
↓ open down ↓ 13 lines elided ↑ open up ↑
1144 1190          movl    REGOFF_RBX(%rsp), %ebx
1145 1191          movl    REGOFF_RBP(%rsp), %ebp
1146 1192          movl    REGOFF_RSI(%rsp), %esi
1147 1193          movl    REGOFF_RDI(%rsp), %edi
1148 1194  
1149 1195          movl    REGOFF_RIP(%rsp), %edx  /* sysexit: %edx -> %eip */
1150 1196          pushq   REGOFF_RFL(%rsp)
1151 1197          popfq
1152 1198          movl    REGOFF_RSP(%rsp), %ecx  /* sysexit: %ecx -> %esp */
1153 1199          ALTENTRY(sys_sysenter_swapgs_sysexit)
1154      -        swapgs
1155      -        sti
1156      -        sysexit
     1200 +        jmp     tr_sysexit
1157 1201          SET_SIZE(sys_sysenter_swapgs_sysexit)
1158 1202          SET_SIZE(sys_sysenter)
1159 1203          SET_SIZE(_sys_sysenter_post_swapgs)
1160 1204          SET_SIZE(brand_sys_sysenter)
1161 1205  
1162 1206  #endif  /* __lint */
1163 1207  
1164 1208  /*
1165 1209   * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
1166 1210   * the generic i386 libc to do system calls. We do a small amount of setup
↓ open down ↓ 30 lines elided ↑ open up ↑
1197 1241           * and use a faster return mechanism.
1198 1242           */
1199 1243          movb    $1, T_POST_SYS(%r15)
1200 1244          CLEAN_CS
1201 1245          jmp     _syscall32_save
1202 1246          /*
1203 1247           * There should be no instructions between this label and SWAPGS/IRET
1204 1248           * or we could end up breaking branded zone support. See the usage of
1205 1249           * this label in lx_brand_int80_callback and sn1_brand_int91_callback
1206 1250           * for examples.
     1251 +         *
     1252 +         * We want to swapgs to maintain the invariant that all entries into
     1253 +         * tr_iret_user are done on the user gsbase.
1207 1254           */
1208      -        ALTENTRY(sys_sysint_swapgs_iret)
1209      -        SWAPGS                          /* user gsbase */
1210      -        IRET
     1255 +        ALTENTRY(sys_sysint_swapgs_iret)
     1256 +        SWAPGS
     1257 +        jmp     tr_iret_user
1211 1258          /*NOTREACHED*/
1212 1259          SET_SIZE(sys_sysint_swapgs_iret)
1213 1260          SET_SIZE(sys_syscall_int)
1214 1261          SET_SIZE(brand_sys_syscall_int)
1215 1262  
1216 1263  #endif  /* __lint */
1217 1264  
1218 1265  /*
1219 1266   * Legacy 32-bit applications and old libc implementations do lcalls;
1220 1267   * we should never get here because the LDT entry containing the syscall
↓ open down ↓ 91 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX