Print this page
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/intel/ia32/os/desctbls.c
          +++ new/usr/src/uts/intel/ia32/os/desctbls.c
↓ open down ↓ 16 lines elided ↑ open up ↑
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27      - * Copyright 2011 Joyent, Inc. All rights reserved.
       27 + * Copyright 2018 Joyent, Inc. All rights reserved.
  28   28   */
  29   29  
  30   30  /*
  31   31   * Copyright (c) 1992 Terrence R. Lambert.
  32   32   * Copyright (c) 1990 The Regents of the University of California.
  33   33   * All rights reserved.
  34   34   *
  35   35   * This code is derived from software contributed to Berkeley by
  36   36   * William Jolitz.
  37   37   *
↓ open down ↓ 38 lines elided ↑ open up ↑
  76   76  #include <sys/x86_archext.h>
  77   77  #include <sys/controlregs.h>
  78   78  #include <sys/archsystm.h>
  79   79  #include <sys/machsystm.h>
  80   80  #include <sys/kobj.h>
  81   81  #include <sys/cmn_err.h>
  82   82  #include <sys/reboot.h>
  83   83  #include <sys/kdi.h>
  84   84  #include <sys/mach_mmu.h>
  85   85  #include <sys/systm.h>
       86 +#include <sys/note.h>
  86   87  
  87   88  #ifdef __xpv
  88   89  #include <sys/hypervisor.h>
  89   90  #include <vm/as.h>
  90   91  #endif
  91   92  
  92   93  #include <sys/promif.h>
  93   94  #include <sys/bootinfo.h>
  94   95  #include <vm/kboot_mmu.h>
  95   96  #include <vm/hat_pte.h>
↓ open down ↓ 25 lines elided ↑ open up ↑
 121  122  user_desc_t     zero_u32desc;           /* 32-bit compatibility procs */
 122  123  #endif  /* __amd64 */
 123  124  
 124  125  #if defined(__amd64)
 125  126  user_desc_t     ucs_on;
 126  127  user_desc_t     ucs_off;
 127  128  user_desc_t     ucs32_on;
 128  129  user_desc_t     ucs32_off;
 129  130  #endif  /* __amd64 */
 130  131  
 131      -#pragma align   16(dblfault_stack0)
 132      -char            dblfault_stack0[DEFAULTSTKSZ];
      132 +/*
      133 + * If the size of this is changed, you must update hat_pcp_setup() and the
      134 + * definitions in exception.s
      135 + */
      136 +extern char dblfault_stack0[DEFAULTSTKSZ];
      137 +extern char nmi_stack0[DEFAULTSTKSZ];
      138 +extern char mce_stack0[DEFAULTSTKSZ];
 133  139  
 134  140  extern void     fast_null(void);
 135  141  extern hrtime_t get_hrtime(void);
 136  142  extern hrtime_t gethrvtime(void);
 137  143  extern hrtime_t get_hrestime(void);
 138  144  extern uint64_t getlgrp(void);
 139  145  
 140  146  void (*(fasttable[]))(void) = {
 141  147          fast_null,                      /* T_FNULL routine */
 142  148          fast_null,                      /* T_FGETFP routine (initially null) */
↓ open down ↓ 160 lines elided ↑ open up ↑
 303  309          base = (uintptr_t)dp->ssd_lobase |
 304  310              (uintptr_t)dp->ssd_midbase << 16 |
 305  311              (uintptr_t)dp->ssd_hibase << (16 + 8);
 306  312          return ((void *)base);
 307  313  }
 308  314  
 309  315  #endif  /* __i386 */
 310  316  
 311  317  /*
 312  318   * Install gate segment descriptor for interrupt, trap, call and task gates.
      319 + *
      320 + * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
      321 + * all interrupts.  We have different ISTs for each class of exceptions that are
      322 + * most likely to occur while handling an existing exception; while many of
      323 + * these are just going to panic, it's nice not to trample on the existing
      324 + * exception state for debugging purposes.
      325 + *
      326 + * Normal interrupts are all redirected unconditionally to the KPTI trampoline
      327 + * stack space. This unifies the trampoline handling between user and kernel
      328 + * space (and avoids the need to touch %gs).
      329 + *
      330 + * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
       331 + * we do a read from KMDB that causes another #PF.  Without its own IST, this
      332 + * would stomp on the kernel's mcpu_kpti_flt frame.
 313  333   */
 314      -
 315      -#if defined(__amd64)
 316      -
 317      -/*ARGSUSED*/
 318      -void
 319      -set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
 320      -    uint_t type, uint_t dpl, uint_t vector)
      334 +uint_t
      335 +idt_vector_to_ist(uint_t vector)
 321  336  {
 322      -        dp->sgd_looffset = (uintptr_t)func;
 323      -        dp->sgd_hioffset = (uintptr_t)func >> 16;
 324      -        dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
      337 +#if defined(__xpv)
      338 +        _NOTE(ARGUNUSED(vector));
      339 +        return (IST_NONE);
      340 +#else
      341 +        switch (vector) {
      342 +        /* These should always use IST even without KPTI enabled. */
      343 +        case T_DBLFLT:
      344 +                return (IST_DF);
      345 +        case T_NMIFLT:
      346 +                return (IST_NMI);
      347 +        case T_MCE:
      348 +                return (IST_MCE);
 325  349  
 326      -        dp->sgd_selector =  (uint16_t)sel;
 327      -
 328      -        /*
 329      -         * For 64 bit native we use the IST stack mechanism
 330      -         * for double faults. All other traps use the CPL = 0
 331      -         * (tss_rsp0) stack.
 332      -         */
 333      -#if !defined(__xpv)
 334      -        if (vector == T_DBLFLT)
 335      -                dp->sgd_ist = 1;
 336      -        else
      350 +        case T_BPTFLT:
      351 +        case T_SGLSTP:
      352 +                if (kpti_enable == 1) {
      353 +                        return (IST_DBG);
      354 +                }
      355 +                return (IST_NONE);
      356 +        case T_STKFLT:
      357 +        case T_GPFLT:
      358 +        case T_PGFLT:
      359 +                if (kpti_enable == 1) {
      360 +                        return (IST_NESTABLE);
      361 +                }
      362 +                return (IST_NONE);
      363 +        default:
      364 +                if (kpti_enable == 1) {
      365 +                        return (IST_DEFAULT);
      366 +                }
      367 +                return (IST_NONE);
      368 +        }
 337  369  #endif
 338      -                dp->sgd_ist = 0;
 339      -
 340      -        dp->sgd_type = type;
 341      -        dp->sgd_dpl = dpl;
 342      -        dp->sgd_p = 1;
 343  370  }
 344  371  
 345      -#elif defined(__i386)
 346      -
 347      -/*ARGSUSED*/
 348  372  void
 349  373  set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
 350      -    uint_t type, uint_t dpl, uint_t unused)
      374 +    uint_t type, uint_t dpl, uint_t ist)
 351  375  {
 352  376          dp->sgd_looffset = (uintptr_t)func;
 353  377          dp->sgd_hioffset = (uintptr_t)func >> 16;
 354      -
      378 +        dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
 355  379          dp->sgd_selector =  (uint16_t)sel;
 356      -        dp->sgd_stkcpy = 0;     /* always zero bytes */
      380 +        dp->sgd_ist = ist;
 357  381          dp->sgd_type = type;
 358  382          dp->sgd_dpl = dpl;
 359  383          dp->sgd_p = 1;
 360  384  }
 361  385  
 362      -#endif  /* __i386 */
 363      -
 364  386  /*
  365  388   * Updates a single user descriptor in the GDT of the current cpu.
 366  388   * Caller is responsible for preventing cpu migration.
 367  389   */
 368  390  
 369  391  void
 370  392  gdt_update_usegd(uint_t sidx, user_desc_t *udp)
 371  393  {
 372  394  #if defined(__xpv)
 373  395  
↓ open down ↓ 536 lines elided ↑ open up ↑
 910  932   * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 911  933   * %gsbase is really still pointing at something in userland. Bad things will
 912  934   * ensue. We also use interrupt gates for i386 as well even though this is not
 913  935   * required for some traps.
 914  936   *
 915  937   * Perhaps they should have invented a trap gate that does an atomic swapgs?
 916  938   */
 917  939  static void
 918  940  init_idt_common(gate_desc_t *idt)
 919  941  {
 920      -        set_gatesegd(&idt[T_ZERODIV], &div0trap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 921      -            0);
 922      -        set_gatesegd(&idt[T_SGLSTP], &dbgtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 923      -            0);
 924      -        set_gatesegd(&idt[T_NMIFLT], &nmiint, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 925      -            0);
 926      -        set_gatesegd(&idt[T_BPTFLT], &brktrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
 927      -            0);
 928      -        set_gatesegd(&idt[T_OVFLW], &ovflotrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
 929      -            0);
 930      -        set_gatesegd(&idt[T_BOUNDFLT], &boundstrap, KCS_SEL, SDT_SYSIGT,
 931      -            TRP_KPL, 0);
 932      -        set_gatesegd(&idt[T_ILLINST], &invoptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 933      -            0);
 934      -        set_gatesegd(&idt[T_NOEXTFLT], &ndptrap,  KCS_SEL, SDT_SYSIGT, TRP_KPL,
 935      -            0);
      942 +        set_gatesegd(&idt[T_ZERODIV],
      943 +            (kpti_enable == 1) ? &tr_div0trap : &div0trap,
      944 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
      945 +        set_gatesegd(&idt[T_SGLSTP],
      946 +            (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
      947 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
      948 +        set_gatesegd(&idt[T_NMIFLT],
      949 +            (kpti_enable == 1) ? &tr_nmiint : &nmiint,
      950 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
      951 +        set_gatesegd(&idt[T_BPTFLT],
      952 +            (kpti_enable == 1) ? &tr_brktrap : &brktrap,
      953 +            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
      954 +        set_gatesegd(&idt[T_OVFLW],
      955 +            (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
      956 +            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
      957 +        set_gatesegd(&idt[T_BOUNDFLT],
      958 +            (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
      959 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
      960 +        set_gatesegd(&idt[T_ILLINST],
      961 +            (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
      962 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
      963 +        set_gatesegd(&idt[T_NOEXTFLT],
      964 +            (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
      965 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
 936  966  
 937  967          /*
 938  968           * double fault handler.
 939  969           *
 940  970           * Note that on the hypervisor a guest does not receive #df faults.
 941  971           * Instead a failsafe event is injected into the guest if its selectors
 942  972           * and/or stack is in a broken state. See xen_failsafe_callback.
 943  973           */
 944  974  #if !defined(__xpv)
 945      -#if defined(__amd64)
 946      -
 947      -        set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 948      -            T_DBLFLT);
 949      -
 950      -#elif defined(__i386)
 951      -
 952      -        /*
 953      -         * task gate required.
 954      -         */
 955      -        set_gatesegd(&idt[T_DBLFLT], NULL, DFTSS_SEL, SDT_SYSTASKGT, TRP_KPL,
 956      -            0);
 957      -
 958      -#endif  /* __i386 */
      975 +        set_gatesegd(&idt[T_DBLFLT],
      976 +            (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
      977 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
 959  978  #endif  /* !__xpv */
 960  979  
 961  980          /*
 962  981           * T_EXTOVRFLT coprocessor-segment-overrun not supported.
 963  982           */
      983 +        set_gatesegd(&idt[T_TSSFLT],
      984 +            (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
      985 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
      986 +        set_gatesegd(&idt[T_SEGFLT],
      987 +            (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
      988 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
      989 +        set_gatesegd(&idt[T_STKFLT],
      990 +            (kpti_enable == 1) ? &tr_stktrap : &stktrap,
      991 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
      992 +        set_gatesegd(&idt[T_GPFLT],
      993 +            (kpti_enable == 1) ? &tr_gptrap : &gptrap,
      994 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
      995 +        set_gatesegd(&idt[T_PGFLT],
      996 +            (kpti_enable == 1) ? &tr_pftrap : &pftrap,
      997 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
      998 +        set_gatesegd(&idt[T_EXTERRFLT],
      999 +            (kpti_enable == 1) ? &tr_ndperr : &ndperr,
     1000 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
     1001 +        set_gatesegd(&idt[T_ALIGNMENT],
     1002 +            (kpti_enable == 1) ? &tr_achktrap : &achktrap,
     1003 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
     1004 +        set_gatesegd(&idt[T_MCE],
     1005 +            (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
     1006 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
     1007 +        set_gatesegd(&idt[T_SIMDFPE],
     1008 +            (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
     1009 +            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
 964 1010  
 965      -        set_gatesegd(&idt[T_TSSFLT], &invtsstrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 966      -            0);
 967      -        set_gatesegd(&idt[T_SEGFLT], &segnptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 968      -            0);
 969      -        set_gatesegd(&idt[T_STKFLT], &stktrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
 970      -        set_gatesegd(&idt[T_GPFLT], &gptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
 971      -        set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
 972      -        set_gatesegd(&idt[T_EXTERRFLT], &ndperr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
 973      -            0);
 974      -        set_gatesegd(&idt[T_ALIGNMENT], &achktrap, KCS_SEL, SDT_SYSIGT,
 975      -            TRP_KPL, 0);
 976      -        set_gatesegd(&idt[T_MCE], &mcetrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
 977      -        set_gatesegd(&idt[T_SIMDFPE], &xmtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
 978      -
 979 1011          /*
 980 1012           * install fast trap handler at 210.
 981 1013           */
 982      -        set_gatesegd(&idt[T_FASTTRAP], &fasttrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
 983      -            0);
     1014 +        set_gatesegd(&idt[T_FASTTRAP],
     1015 +            (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
     1016 +            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
 984 1017  
 985 1018          /*
 986 1019           * System call handler.
 987 1020           */
 988      -#if defined(__amd64)
 989      -        set_gatesegd(&idt[T_SYSCALLINT], &sys_syscall_int, KCS_SEL, SDT_SYSIGT,
 990      -            TRP_UPL, 0);
     1021 +        set_gatesegd(&idt[T_SYSCALLINT],
     1022 +            (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
     1023 +            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
 991 1024  
 992      -#elif defined(__i386)
 993      -        set_gatesegd(&idt[T_SYSCALLINT], &sys_call, KCS_SEL, SDT_SYSIGT,
 994      -            TRP_UPL, 0);
 995      -#endif  /* __i386 */
 996      -
 997 1025          /*
 998 1026           * Install the DTrace interrupt handler for the pid provider.
 999 1027           */
1000      -        set_gatesegd(&idt[T_DTRACE_RET], &dtrace_ret, KCS_SEL,
1001      -            SDT_SYSIGT, TRP_UPL, 0);
     1028 +        set_gatesegd(&idt[T_DTRACE_RET],
     1029 +            (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
     1030 +            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
1002 1031  
1003 1032          /*
1004 1033           * Prepare interposing descriptor for the syscall handler
1005 1034           * and cache copy of the default descriptor.
1006 1035           */
1007 1036          brand_tbl[0].ih_inum = T_SYSCALLINT;
1008 1037          brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
1009 1038  
1010      -#if defined(__amd64)
1011      -        set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_syscall_int,
1012      -            KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
1013      -#elif defined(__i386)
1014      -        set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_call,
1015      -            KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
1016      -#endif  /* __i386 */
     1039 +        set_gatesegd(&(brand_tbl[0].ih_interp_desc),
     1040 +            (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
     1041 +            &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
     1042 +            idt_vector_to_ist(T_SYSCALLINT));
1017 1043  
1018 1044          brand_tbl[1].ih_inum = 0;
1019 1045  }
1020 1046  
1021 1047  #if defined(__xpv)
1022 1048  
1023 1049  static void
1024 1050  init_idt(gate_desc_t *idt)
1025 1051  {
1026 1052          init_idt_common(idt);
↓ open down ↓ 7 lines elided ↑ open up ↑
1034 1060          char    ivctname[80];
1035 1061          void    (*ivctptr)(void);
1036 1062          int     i;
1037 1063  
1038 1064          /*
1039 1065           * Initialize entire table with 'reserved' trap and then overwrite
1040 1066           * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
1041 1067           * since it can only be generated on a 386 processor. 15 is also
1042 1068           * unsupported and reserved.
1043 1069           */
1044      -        for (i = 0; i < NIDT; i++)
     1070 +#if !defined(__xpv)
     1071 +        for (i = 0; i < NIDT; i++) {
     1072 +                set_gatesegd(&idt[i],
     1073 +                    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
     1074 +                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
     1075 +                    idt_vector_to_ist(T_RESVTRAP));
     1076 +        }
     1077 +#else
     1078 +        for (i = 0; i < NIDT; i++) {
1045 1079                  set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1046      -                    0);
     1080 +                    IST_NONE);
     1081 +        }
     1082 +#endif
1047 1083  
1048 1084          /*
1049 1085           * 20-31 reserved
1050 1086           */
1051      -        for (i = 20; i < 32; i++)
     1087 +#if !defined(__xpv)
     1088 +        for (i = 20; i < 32; i++) {
     1089 +                set_gatesegd(&idt[i],
     1090 +                    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
     1091 +                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
     1092 +                    idt_vector_to_ist(T_INVALTRAP));
     1093 +        }
     1094 +#else
     1095 +        for (i = 20; i < 32; i++) {
1052 1096                  set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1053      -                    0);
     1097 +                    IST_NONE);
     1098 +        }
     1099 +#endif
1054 1100  
1055 1101          /*
1056 1102           * interrupts 32 - 255
1057 1103           */
1058 1104          for (i = 32; i < 256; i++) {
     1105 +#if !defined(__xpv)
     1106 +                (void) snprintf(ivctname, sizeof (ivctname),
     1107 +                    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
     1108 +#else
1059 1109                  (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
     1110 +#endif
1060 1111                  ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
1061 1112                  if (ivctptr == NULL)
1062 1113                          panic("kobj_getsymvalue(%s) failed", ivctname);
1063 1114  
1064      -                set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
     1115 +                set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
     1116 +                    idt_vector_to_ist(i));
1065 1117          }
1066 1118  
1067 1119          /*
1068 1120           * Now install the common ones. Note that it will overlay some
1069 1121           * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
1070 1122           */
1071 1123          init_idt_common(idt);
1072 1124  }
1073 1125  
1074 1126  #endif  /* __xpv */
↓ open down ↓ 8 lines elided ↑ open up ↑
1083 1135  init_ldt(void)
1084 1136  {
1085 1137  #if defined(__xpv)
1086 1138          xen_set_ldt(NULL, 0);
1087 1139  #else
1088 1140          wr_ldtr(0);
1089 1141  #endif
1090 1142  }
1091 1143  
1092 1144  #if !defined(__xpv)
1093      -#if defined(__amd64)
1094 1145  
1095 1146  static void
1096 1147  init_tss(void)
1097 1148  {
1098      -        /*
1099      -         * tss_rsp0 is dynamically filled in by resume() on each context switch.
1100      -         * All exceptions but #DF will run on the thread stack.
1101      -         * Set up the double fault stack here.
1102      -         */
1103      -        ktss0->tss_ist1 =
1104      -            (uint64_t)&dblfault_stack0[sizeof (dblfault_stack0)];
     1149 +        extern struct cpu cpus[];
1105 1150  
1106 1151          /*
1107      -         * Set I/O bit map offset equal to size of TSS segment limit
1108      -         * for no I/O permission map. This will force all user I/O
1109      -         * instructions to generate #gp fault.
     1152 +         * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
     1153 +         * context switch but it'll be overwritten with this same value anyway.
1110 1154           */
1111      -        ktss0->tss_bitmapbase = sizeof (*ktss0);
     1155 +        if (kpti_enable == 1) {
     1156 +                ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
     1157 +        }
1112 1158  
1113      -        /*
1114      -         * Point %tr to descriptor for ktss0 in gdt.
1115      -         */
1116      -        wr_tsr(KTSS_SEL);
1117      -}
     1159 +        /* Set up the IST stacks for double fault, NMI, MCE. */
     1160 +        ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
     1161 +        ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
     1162 +        ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
1118 1163  
1119      -#elif defined(__i386)
1120      -
1121      -static void
1122      -init_tss(void)
1123      -{
1124 1164          /*
1125      -         * ktss0->tss_esp dynamically filled in by resume() on each
1126      -         * context switch.
     1165 +         * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
     1166 +         * enabled), and also for KDI (always).
1127 1167           */
1128      -        ktss0->tss_ss0  = KDS_SEL;
1129      -        ktss0->tss_eip  = (uint32_t)_start;
1130      -        ktss0->tss_ds   = ktss0->tss_es = ktss0->tss_ss = KDS_SEL;
1131      -        ktss0->tss_cs   = KCS_SEL;
1132      -        ktss0->tss_fs   = KFS_SEL;
1133      -        ktss0->tss_gs   = KGS_SEL;
1134      -        ktss0->tss_ldt  = ULDT_SEL;
     1168 +        ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
1135 1169  
1136      -        /*
1137      -         * Initialize double fault tss.
1138      -         */
1139      -        dftss0->tss_esp0 = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
1140      -        dftss0->tss_ss0 = KDS_SEL;
     1170 +        if (kpti_enable == 1) {
     1171 +                /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
     1172 +                ktss0->tss_ist5 =
     1173 +                    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
1141 1174  
1142      -        /*
1143      -         * tss_cr3 will get initialized in hat_kern_setup() once our page
1144      -         * tables have been setup.
1145      -         */
1146      -        dftss0->tss_eip = (uint32_t)syserrtrap;
1147      -        dftss0->tss_esp = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
1148      -        dftss0->tss_cs  = KCS_SEL;
1149      -        dftss0->tss_ds  = KDS_SEL;
1150      -        dftss0->tss_es  = KDS_SEL;
1151      -        dftss0->tss_ss  = KDS_SEL;
1152      -        dftss0->tss_fs  = KFS_SEL;
1153      -        dftss0->tss_gs  = KGS_SEL;
     1175 +                /* This IST stack is used for all other intrs (for KPTI). */
     1176 +                ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
     1177 +        }
1154 1178  
1155 1179          /*
1156 1180           * Set I/O bit map offset equal to size of TSS segment limit
1157 1181           * for no I/O permission map. This will force all user I/O
1158 1182           * instructions to generate #gp fault.
1159 1183           */
1160 1184          ktss0->tss_bitmapbase = sizeof (*ktss0);
1161 1185  
1162 1186          /*
1163 1187           * Point %tr to descriptor for ktss0 in gdt.
1164 1188           */
1165 1189          wr_tsr(KTSS_SEL);
1166 1190  }
1167 1191  
1168      -#endif  /* __i386 */
1169 1192  #endif  /* !__xpv */
1170 1193  
1171 1194  #if defined(__xpv)
1172 1195  
1173 1196  void
1174 1197  init_desctbls(void)
1175 1198  {
1176 1199          uint_t vec;
1177 1200          user_desc_t *gdt;
1178 1201  
↓ open down ↓ 71 lines elided ↑ open up ↑
1250 1273  #endif
1251 1274  
1252 1275          /*
1253 1276           * Setup and install our GDT.
1254 1277           */
1255 1278          gdt = init_gdt();
1256 1279          ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1257 1280          CPU->cpu_gdt = gdt;
1258 1281  
1259 1282          /*
     1283 +         * Initialize this CPU's LDT.
     1284 +         */
     1285 +        CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
     1286 +            LDT_CPU_SIZE, PAGESIZE);
     1287 +        bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
     1288 +        CPU->cpu_m.mcpu_ldt_len = 0;
     1289 +
     1290 +        /*
1260 1291           * Setup and install our IDT.
1261 1292           */
1262 1293          init_idt(idt0);
1263 1294  
1264 1295          idtr.dtr_base = (uintptr_t)idt0;
1265 1296          idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
1266 1297          wr_idtr(&idtr);
1267 1298          CPU->cpu_idt = idt0;
1268 1299  
1269 1300  #if defined(__i386)
1270 1301          /*
1271 1302           * We maintain a description of idt0 in convenient IDTR format
1272 1303           * for #pf's on some older pentium processors. See pentium_pftrap().
1273 1304           */
1274 1305          idt0_default_r = idtr;
1275 1306  #endif  /* __i386 */
1276 1307  
1277 1308          init_tss();
1278 1309          CPU->cpu_tss = ktss0;
1279 1310          init_ldt();
     1311 +
     1312 +        /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
     1313 +        kpti_safe_cr3 = (uint64_t)getcr3();
1280 1314  }
1281 1315  
1282 1316  #endif  /* __xpv */
1283 1317  
1284 1318  /*
1285 1319   * In the early kernel, we need to set up a simple GDT to run on.
1286 1320   *
1287 1321   * XXPV Can dboot use this too?  See dboot_gdt.s
1288 1322   */
1289 1323  void
↓ open down ↓ 40 lines elided ↑ open up ↑
1330 1364           * Currently the hypervisor only supports 64-bit syscalls via
1331 1365           * syscall instruction. The 32-bit syscalls are handled by
1332 1366           * interrupt gate above.
1333 1367           */
1334 1368          xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
1335 1369              CALLBACKF_mask_events);
1336 1370  
1337 1371  #else
1338 1372  
1339 1373          if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1340      -                wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
1341      -                wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
     1374 +                if (kpti_enable == 1) {
     1375 +                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
     1376 +                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
     1377 +                } else {
     1378 +                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
     1379 +                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
     1380 +                }
1342 1381          }
1343 1382  
1344 1383  #endif
1345 1384  #endif  /* __amd64 */
1346 1385  
1347      -        if (is_x86_feature(x86_featureset, X86FSET_SEP))
1348      -                wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
     1386 +        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
     1387 +                if (kpti_enable == 1) {
     1388 +                        wrmsr(MSR_INTC_SEP_EIP,
     1389 +                            (uintptr_t)tr_brand_sys_sysenter);
     1390 +                } else {
     1391 +                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
     1392 +                }
     1393 +        }
1349 1394  }
1350 1395  
1351 1396  /*
1352 1397   * Disable interpositioning on the system call path by rewriting the
1353 1398   * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1354 1399   * the standard entry points, which bypass the interpositioning hooks.
1355 1400   */
1356 1401  void
1357 1402  brand_interpositioning_disable(void)
1358 1403  {
↓ open down ↓ 15 lines elided ↑ open up ↑
1374 1419  
1375 1420          /*
1376 1421           * See comment above in brand_interpositioning_enable.
1377 1422           */
1378 1423          xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
1379 1424              CALLBACKF_mask_events);
1380 1425  
1381 1426  #else
1382 1427  
1383 1428          if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1384      -                wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
1385      -                wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
     1429 +                if (kpti_enable == 1) {
     1430 +                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
     1431 +                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
     1432 +                } else {
     1433 +                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
     1434 +                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
     1435 +                }
1386 1436          }
1387 1437  
1388 1438  #endif
1389 1439  #endif  /* __amd64 */
1390 1440  
1391      -        if (is_x86_feature(x86_featureset, X86FSET_SEP))
1392      -                wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
     1441 +        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
     1442 +                if (kpti_enable == 1) {
     1443 +                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
     1444 +                } else {
     1445 +                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
     1446 +                }
     1447 +        }
1393 1448  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX