8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28 /*              All Rights Reserved                             */
  29 /*                                                              */
  30 /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31 /*              All Rights Reserved                             */
  32 /*                                                              */
  33 
  34 /*
  35  * Copyright 2017 Joyent, Inc.
  36  */
  37 
  38 #include <sys/types.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/param.h>
  41 #include <sys/signal.h>
  42 #include <sys/systm.h>
  43 #include <sys/user.h>
  44 #include <sys/proc.h>
  45 #include <sys/disp.h>
  46 #include <sys/class.h>
  47 #include <sys/core.h>
  48 #include <sys/syscall.h>
  49 #include <sys/cpuvar.h>
  50 #include <sys/vm.h>
  51 #include <sys/sysinfo.h>
  52 #include <sys/fault.h>
  53 #include <sys/stack.h>
  54 #include <sys/psw.h>
  55 #include <sys/regset.h>


 463  */
 464 void
 465 trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 466 {
 467         kthread_t *ct = curthread;
 468         enum seg_rw rw;
 469         unsigned type;
 470         proc_t *p = ttoproc(ct);
 471         klwp_t *lwp = ttolwp(ct);
 472         uintptr_t lofault;
 473         label_t *onfault;
 474         faultcode_t pagefault(), res, errcode;
 475         enum fault_type fault_type;
 476         k_siginfo_t siginfo;
 477         uint_t fault = 0;
 478         int mstate;
 479         int sicode = 0;
 480         int watchcode;
 481         int watchpage;
 482         caddr_t vaddr;
 483         int singlestep_twiddle;
 484         size_t sz;
 485         int ta;
 486 #ifdef __amd64
 487         uchar_t instr;
 488 #endif
 489 
 490         ASSERT_STACK_ALIGNED();
 491 
 492         type = rp->r_trapno;
 493         CPU_STATS_ADDQ(CPU, sys, trap, 1);
 494         ASSERT(ct->t_schedflag & TS_DONT_SWAP);
 495 
 496         if (type == T_PGFLT) {
 497 
 498                 errcode = rp->r_err;
 499                 if (errcode & PF_ERR_WRITE)
 500                         rw = S_WRITE;
 501                 else if ((caddr_t)rp->r_pc == addr ||
 502                     (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC)))
 503                         rw = S_EXEC;


1074 
1075                 sti();  /* The SIMD exception comes in via cmninttrap */
1076                 break;
1077 
1078         case T_BPTFLT:  /* breakpoint trap */
1079                 /*
1080                  * Kernel breakpoint traps should only happen when kmdb is
1081                  * active, and even then, it'll have interposed on the IDT, so
1082                  * control won't get here.  If it does, we've hit a breakpoint
1083                  * without the debugger, which is very strange, and very
1084                  * fatal.
1085                  */
1086                 if (tudebug && tudebugbpt)
1087                         showregs(type, rp, (caddr_t)0);
1088 
1089                 (void) die(type, rp, addr, cpuid);
1090                 break;
1091 
1092         case T_SGLSTP: /* single step/hw breakpoint exception */
1093 
1094                 /* Now evaluate how we got here */
1095                 if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
1096                         /*
1097                          * i386 single-steps even through lcalls which
1098                          * change the privilege level. So we take a trap at
1099                          * the first instruction in privileged mode.
1100                          *
1101                  * Set a flag so that, upon completion of the
1102                  * system call, we deal with the single-step trap.
1103                          *
1104                          * The same thing happens for sysenter, too.
1105                          */
1106                         singlestep_twiddle = 0;
1107                         if (rp->r_pc == (uintptr_t)sys_sysenter ||
1108                             rp->r_pc == (uintptr_t)brand_sys_sysenter) {
1109                                 singlestep_twiddle = 1;
1110 #if defined(__amd64)
1111                                 /*
1112                                  * Since we are already on the kernel's
1113                                  * %gs, on 64-bit systems the sysenter case
1114                                  * needs to adjust the pc to avoid
1115                                  * executing the swapgs instruction at the
1116                                  * top of the handler.
1117                                  */
1118                                 if (rp->r_pc == (uintptr_t)sys_sysenter)
1119                                         rp->r_pc = (uintptr_t)
1120                                             _sys_sysenter_post_swapgs;
1121                                 else
1122                                         rp->r_pc = (uintptr_t)
1123                                             _brand_sys_sysenter_post_swapgs;
1124 #endif
1125                         }
1126 #if defined(__i386)
1127                         else if (rp->r_pc == (uintptr_t)sys_call ||
1128                             rp->r_pc == (uintptr_t)brand_sys_call) {
1129                                 singlestep_twiddle = 1;
1130                         }
1131 #endif
1132                         else {
1133                                 /* not on sysenter/syscall; uregs available */
1134                                 if (tudebug && tudebugbpt)
1135                                         showregs(type, rp, (caddr_t)0);
1136                         }
1137                         if (singlestep_twiddle) {
1138                                 rp->r_ps &= ~PS_T; /* turn off trace */
1139                                 lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
1140                                 ct->t_post_sys = 1;
1141                                 aston(curthread);
1142                                 goto cleanup;
1143                         }
1144                 }
1145                 /* XXX - needs review on debugger interface? */
1146                 if (boothowto & RB_DEBUG)
1147                         debug_enter((char *)NULL);
1148                 else
1149                         (void) die(type, rp, addr, cpuid);
1150                 break;
1151 
1152         case T_NMIFLT:  /* NMI interrupt */
1153                 printf("Unexpected NMI in system mode\n");
1154                 goto cleanup;
1155 
1156         case T_NMIFLT + USER:   /* NMI interrupt */
1157                 printf("Unexpected NMI in user mode\n");
1158                 break;
1159 
1160         case T_GPFLT:   /* general protection violation */
1161                 /*
1162                  * Any #GP that occurs during an on_trap .. no_trap bracket
1163                  * with OT_DATA_ACCESS or OT_SEGMENT_ACCESS protection,
1164  * or in an on_fault .. no_fault bracket, is forgiven
1165                  * and we trampoline.  This protection is given regardless


1721         } else if (addr) {
1722                 printf("addr=0x%lx\n", (uintptr_t)addr);
1723         }
1724 
1725         printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n",
1726             (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ?
1727             ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps);
1728 
1729 #if defined(__lint)
1730         /*
1731          * this clause can be deleted when lint bug 4870403 is fixed
1732          * (lint thinks that bit 32 is illegal in a %b format string)
1733          */
1734         printf("cr0: %x cr4: %b\n",
1735             (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
1736 #else
1737         printf("cr0: %b cr4: %b\n",
1738             (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
1739 #endif  /* __lint */
1740 
1741         printf("cr2: %lx", getcr2());
1742 #if !defined(__xpv)
1743         printf("cr3: %lx", getcr3());
1744 #if defined(__amd64)
1745         printf("cr8: %lx\n", getcr8());
1746 #endif
1747 #endif
1748         printf("\n");
1749 
1750         dumpregs(rp);
1751         splx(s);
1752 }
1753 
1754 static void
1755 dumpregs(struct regs *rp)
1756 {
1757 #if defined(__amd64)
1758         const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n";
1759 
1760         printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx);
1761         printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9);
1762         printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp);
1763         printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12);


1829 static int
1830 instr_is_segregs_pop(caddr_t pc)
1831 {
1832         static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 };
1833         static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 };
1834         static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 };
1835         static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc };
1836 
1837         if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 ||
1838             bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 ||
1839             bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 ||
1840             bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0)
1841                 return (1);
1842 
1843         return (0);
1844 }
1845 
1846 #endif  /* __i386 */
1847 
1848 /*
1849  * Test to see if the instruction is part of _sys_rtt.
1850  *
1851  * Again on the hypervisor if we try to IRET to user land with a bad code
1852  * or stack selector we will get vectored through xen_failsafe_callback.
1853  * In which case we assume we got here via _sys_rtt since we only allow
1854  * IRET to user land to take place in _sys_rtt.
1855  */
1856 static int
1857 instr_is_sys_rtt(caddr_t pc)
1858 {
1859         extern void _sys_rtt(), _sys_rtt_end();
1860 
1861         if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
1862             (uintptr_t)pc > (uintptr_t)_sys_rtt_end)
1863                 return (0);
1864 
1865         return (1);
1866 }
1867 
1868 /*
1869  * Handle #gp faults in kernel mode.
1870  *
1871  * One legitimate way this can happen is if we attempt to update segment
1872  * registers to naughty values on the way out of the kernel.
1873  *
1874  * This can happen in a couple of ways: someone - either accidentally or
1875  * on purpose - creates (setcontext(2), lwp_create(2)) or modifies
1876  * (signal(2)) a ucontext that contains silly segment register values.
1877  * Or someone - either accidentally or on purpose - modifies the prgregset_t
1878  * of a subject process via /proc to contain silly segment register values.
1879  *
1880  * (The unfortunate part is that we can end up discovering the bad segment




  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28 /*              All Rights Reserved                             */
  29 /*                                                              */
  30 /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31 /*              All Rights Reserved                             */
  32 /*                                                              */
  33 
  34 /*
  35  * Copyright 2018 Joyent, Inc.
  36  */
  37 
  38 #include <sys/types.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/param.h>
  41 #include <sys/signal.h>
  42 #include <sys/systm.h>
  43 #include <sys/user.h>
  44 #include <sys/proc.h>
  45 #include <sys/disp.h>
  46 #include <sys/class.h>
  47 #include <sys/core.h>
  48 #include <sys/syscall.h>
  49 #include <sys/cpuvar.h>
  50 #include <sys/vm.h>
  51 #include <sys/sysinfo.h>
  52 #include <sys/fault.h>
  53 #include <sys/stack.h>
  54 #include <sys/psw.h>
  55 #include <sys/regset.h>


 463  */
 464 void
 465 trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 466 {
 467         kthread_t *ct = curthread;
 468         enum seg_rw rw;
 469         unsigned type;
 470         proc_t *p = ttoproc(ct);
 471         klwp_t *lwp = ttolwp(ct);
 472         uintptr_t lofault;
 473         label_t *onfault;
 474         faultcode_t pagefault(), res, errcode;
 475         enum fault_type fault_type;
 476         k_siginfo_t siginfo;
 477         uint_t fault = 0;
 478         int mstate;
 479         int sicode = 0;
 480         int watchcode;
 481         int watchpage;
 482         caddr_t vaddr;
 483         size_t sz;
 484         int ta;
 485 #ifdef __amd64
 486         uchar_t instr;
 487 #endif
 488 
 489         ASSERT_STACK_ALIGNED();
 490 
 491         type = rp->r_trapno;
 492         CPU_STATS_ADDQ(CPU, sys, trap, 1);
 493         ASSERT(ct->t_schedflag & TS_DONT_SWAP);
 494 
 495         if (type == T_PGFLT) {
 496 
 497                 errcode = rp->r_err;
 498                 if (errcode & PF_ERR_WRITE)
 499                         rw = S_WRITE;
 500                 else if ((caddr_t)rp->r_pc == addr ||
 501                     (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC)))
 502                         rw = S_EXEC;


1073 
1074                 sti();  /* The SIMD exception comes in via cmninttrap */
1075                 break;
1076 
1077         case T_BPTFLT:  /* breakpoint trap */
1078                 /*
1079                  * Kernel breakpoint traps should only happen when kmdb is
1080                  * active, and even then, it'll have interposed on the IDT, so
1081                  * control won't get here.  If it does, we've hit a breakpoint
1082                  * without the debugger, which is very strange, and very
1083                  * fatal.
1084                  */
1085                 if (tudebug && tudebugbpt)
1086                         showregs(type, rp, (caddr_t)0);
1087 
1088                 (void) die(type, rp, addr, cpuid);
1089                 break;
1090 
1091         case T_SGLSTP: /* single step/hw breakpoint exception */
1092 
1093 #if !defined(__xpv)
1094                 /*
1095                  * We'd never normally get here, as kmdb handles its own single
1096                  * step traps.  There is one nasty exception though, as
1097                  * described in more detail in sys_sysenter().  Note that
1098                  * checking for all four locations covers both the KPTI and the
1099                  * non-KPTI cases correctly: the former will never be found at
1100                  * (brand_)sys_sysenter, and vice versa.
1101                  */
1102                 if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
1103                         if (rp->r_pc == (greg_t)brand_sys_sysenter ||
1104                             rp->r_pc == (greg_t)sys_sysenter ||
1105                             rp->r_pc == (greg_t)tr_brand_sys_sysenter ||
1106                             rp->r_pc == (greg_t)tr_sys_sysenter) {
1107 
1108                                 rp->r_pc += 0x3; /* sizeof (swapgs) */
1109 
1110                                 rp->r_ps &= ~PS_T; /* turn off trace */
1111                                 lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
1112                                 ct->t_post_sys = 1;
1113                                 aston(curthread);
1114                                 goto cleanup;
1115                         } else {
1116                                 if (tudebug && tudebugbpt)
1117                                         showregs(type, rp, (caddr_t)0);
1118                         }
1119                 }
1120 #endif /* !__xpv */
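
For context on the fixup above: the 0x3 added to r_pc is the length of the swapgs instruction (it encodes as 0f 01 f8), so the resumed code skips the handler's leading swapgs, which must not run again since we are already on the kernel's %gs. A more self-documenting spelling, shown purely as a sketch and not part of this change (the constant name below is hypothetical), would be:

    /* Hypothetical constant for illustration; the change uses a bare 0x3. */
    #define SWAPGS_INSTR_LEN        0x3     /* swapgs encodes as 0f 01 f8 */

            rp->r_pc += SWAPGS_INSTR_LEN;   /* step past the swapgs at the handler entry */
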
1121 
1122                 if (boothowto & RB_DEBUG)
1123                         debug_enter((char *)NULL);
1124                 else
1125                         (void) die(type, rp, addr, cpuid);
1126                 break;
1127 
1128         case T_NMIFLT:  /* NMI interrupt */
1129                 printf("Unexpected NMI in system mode\n");
1130                 goto cleanup;
1131 
1132         case T_NMIFLT + USER:   /* NMI interrupt */
1133                 printf("Unexpected NMI in user mode\n");
1134                 break;
1135 
1136         case T_GPFLT:   /* general protection violation */
1137                 /*
1138                  * Any #GP that occurs during an on_trap .. no_trap bracket
1139                  * with OT_DATA_ACCESS or OT_SEGMENT_ACCESS protection,
1140  * or in an on_fault .. no_fault bracket, is forgiven

1141                  * and we trampoline.  This protection is given regardless


1697         } else if (addr) {
1698                 printf("addr=0x%lx\n", (uintptr_t)addr);
1699         }
1700 
1701         printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n",
1702             (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ?
1703             ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps);
1704 
1705 #if defined(__lint)
1706         /*
1707          * this clause can be deleted when lint bug 4870403 is fixed
1708          * (lint thinks that bit 32 is illegal in a %b format string)
1709          */
1710         printf("cr0: %x  cr4: %b\n",
1711             (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
1712 #else
1713         printf("cr0: %b  cr4: %b\n",
1714             (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
1715 #endif  /* __lint */
1716 
1717         printf("cr2: %lx  ", getcr2());
1718 #if !defined(__xpv)
1719         printf("cr3: %lx  ", getcr3());
1720 #if defined(__amd64)
1721         printf("cr8: %lx\n", getcr8());
1722 #endif
1723 #endif
1724         printf("\n");
1725 
1726         dumpregs(rp);
1727         splx(s);
1728 }
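
The %b conversions used for cr0 and cr4 above are the kernel printf bit-field format: the value is printed in the base named by the leading byte of the descriptor string, followed by the names of whichever described bits are set. A minimal sketch with a made-up descriptor (FMT_CR0 and FMT_CR4 are the real descriptors; the one below is hypothetical):

    /* Hypothetical descriptor: \20 selects hex; each \<n> names bit n (origin 1). */
    #define FMT_EXAMPLE     "\20\2bitone\1bitzero"

            printf("flags: %b\n", (uint_t)0x3, FMT_EXAMPLE);
            /* expected to print something like: flags: 3<bitone,bitzero> */
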
1729 
1730 static void
1731 dumpregs(struct regs *rp)
1732 {
1733 #if defined(__amd64)
1734         const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n";
1735 
1736         printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx);
1737         printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9);
1738         printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp);
1739         printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12);


1805 static int
1806 instr_is_segregs_pop(caddr_t pc)
1807 {
1808         static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 };
1809         static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 };
1810         static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 };
1811         static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc };
1812 
1813         if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 ||
1814             bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 ||
1815             bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 ||
1816             bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0)
1817                 return (1);
1818 
1819         return (0);
1820 }
1821 
1822 #endif  /* __i386 */
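
As an aside for readers of instr_is_segregs_pop() above, the four byte patterns are the segment-register reloads at the tail of the 32-bit return-to-user path. Decoded (0x8e is the x86 opcode for a mov into a segment register), for reference only and not part of the change, they are:

    /*
     * 8e 6c 24 00    movw 0x0(%esp), %gs
     * 8e 64 24 04    movw 0x4(%esp), %fs
     * 8e 44 24 08    movw 0x8(%esp), %es
     * 8e 5c 24 0c    movw 0xc(%esp), %ds
     */
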
1823 
1824 /*
1825  * Test to see if the instruction is part of _sys_rtt (or the KPTI trampolines
1826  * which are used by _sys_rtt).
1827  *
1828  * Again on the hypervisor if we try to IRET to user land with a bad code
1829  * or stack selector we will get vectored through xen_failsafe_callback.
1830  * In which case we assume we got here via _sys_rtt since we only allow
1831  * IRET to user land to take place in _sys_rtt.
1832  */
1833 static int
1834 instr_is_sys_rtt(caddr_t pc)
1835 {
1836         extern void _sys_rtt(), _sys_rtt_end();
1837 
1838 #if !defined(__xpv)
1839         extern void tr_sysc_ret_start(), tr_sysc_ret_end();
1840         extern void tr_intr_ret_start(), tr_intr_ret_end();
1841 
1842         if ((uintptr_t)pc >= (uintptr_t)tr_sysc_ret_start &&
1843             (uintptr_t)pc <= (uintptr_t)tr_sysc_ret_end)
1844                 return (1);
1845 
1846         if ((uintptr_t)pc >= (uintptr_t)tr_intr_ret_start &&
1847             (uintptr_t)pc <= (uintptr_t)tr_intr_ret_end)
1848                 return (1);
1849 #endif
1850 
1851         if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
1852             (uintptr_t)pc > (uintptr_t)_sys_rtt_end)
1853                 return (0);
1854 
1855         return (1);
1856 }
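
A rough sketch of how a predicate like this is consumed (hypothetical caller, not taken from this change): the kernel #gp path can ask whether the faulting PC lies on the return-to-user path, now including the KPTI trampolines, and if so treat the fault as bad user state rather than a kernel bug.

            /* Hypothetical illustration only. */
            if (instr_is_sys_rtt((caddr_t)rp->r_pc)) {
                    /* #gp hit in _sys_rtt or a KPTI return trampoline: */
                    /* ... reflect the fault back at the user ... */
            } else {
                    (void) die(type, rp, addr, cpuid);
            }
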
1857 
1858 /*
1859  * Handle #gp faults in kernel mode.
1860  *
1861  * One legitimate way this can happen is if we attempt to update segment
1862  * registers to naughty values on the way out of the kernel.
1863  *
1864  * This can happen in a couple of ways: someone - either accidentally or
1865  * on purpose - creates (setcontext(2), lwp_create(2)) or modifies
1866  * (signal(2)) a ucontext that contains silly segment register values.
1867  * Or someone - either accidentally or on purpose - modifies the prgregset_t
1868  * of a subject process via /proc to contain silly segment register values.
1869  *
1870  * (The unfortunate part is that we can end up discovering the bad segment