8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Joyent, Inc. All rights reserved.
  30  */
  31 
  32 /*
  33  * Welcome to the world of the "real mode platter".
  34  * See also startup.c, mpcore.s and apic.c for related routines.
  35  */
  36 
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cpuvar.h>
  40 #include <sys/cpu_module.h>
  41 #include <sys/kmem.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/machsystm.h>
  44 #include <sys/controlregs.h>
  45 #include <sys/x86_archext.h>
  46 #include <sys/smp_impldefs.h>
  47 #include <sys/sysmacros.h>
  48 #include <sys/mach_mmu.h>
  49 #include <sys/promif.h>


 116 mach_cpucontext_fini(void)
 117 {
 118         if (warm_reset_vector)
 119                 psm_unmap_phys((caddr_t)warm_reset_vector,
 120                     sizeof (warm_reset_vector));
 121         hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
 122             HAT_UNLOAD);
 123 }
 124 
 125 #if defined(__amd64)
 126 extern void *long_mode_64(void);
 127 #endif  /* __amd64 */
 128 
 129 /*ARGSUSED*/
 130 void
 131 rmp_gdt_init(rm_platter_t *rm)
 132 {
 133 
 134 #if defined(__amd64)
 135         /* Use the kas address space for the CPU startup thread. */
 136         if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
 137                 panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
 138                     "located above 4G in physical memory (@ 0x%lx)",
 139                     MAKECR3(kas.a_hat->hat_htable->ht_pfn));
 140 
 141         /*
 142          * Set up pseudo-descriptors for temporary GDT and IDT for use ONLY
 143          * by code in real_mode_start_cpu():
 144          *
 145          * GDT[0]:  NULL selector
 146          * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
 147          *
 148          * Clear the IDT as interrupts will be off and a limit of 0 will cause
 149          * the CPU to triple fault and reset on an NMI, seemingly as reasonable
 150          * a course of action as any other, though it may cause the entire
 151          * platform to reset in some cases...
 152          */
 153         rm->rm_temp_gdt[0] = 0;
 154         rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
 155 
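
The 0x20980000000000ULL constant is the descriptor's high dword shifted into place; base and limit stay zero because long mode ignores them for code segments. A sketch of the decomposition, using illustrative macro names that are not part of the source:

    #define TEMP_CS_LONG     (1u << 21)   /* L: 64-bit code segment */
    #define TEMP_CS_PRESENT  (1u << 15)   /* P: segment present */
    #define TEMP_CS_S        (1u << 12)   /* S: code/data descriptor */
    #define TEMP_CS_CODE     (1u << 11)   /* type bit 11: code */

    /* ((uint64_t)(TEMP_CS_LONG | TEMP_CS_PRESENT | TEMP_CS_S |
     *     TEMP_CS_CODE) << 32) == 0x20980000000000ULL */
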
 156         rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
 157         rm->rm_temp_gdt_base = rm_platter_pa +
 158             (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
 159         rm->rm_temp_idt_lim = 0;
 160         rm->rm_temp_idt_base = 0;
 161 
 162         /*
 163          * Since the CPU needs to jump to protected mode using an
 164          * identity-mapped address, we need to calculate it here.
 165          */
 166         rm->rm_longmode64_addr = rm_platter_pa +
 167             (uint32_t)((uintptr_t)long_mode_64 -
 168             (uintptr_t)real_mode_start_cpu);
 169 #endif  /* __amd64 */
 170 }
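
The rm_longmode64_addr computation above is plain offset arithmetic: the startup text from real_mode_start_cpu through real_mode_start_cpu_end is copied onto the platter page, so any symbol inside it moves by the same delta. A sketch with hypothetical offsets:

    /*
     * Suppose long_mode_64 lies 0x124 bytes past real_mode_start_cpu
     * in the kernel's text.  After the copy, the platter's instance of
     * long_mode_64 sits at physical rm_platter_pa + 0x124, and since
     * the AP is running on an identity mapping at that point, the same
     * value serves as the virtual target of the jump into long mode.
     */
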
 171 
 172 static void *
 173 mach_cpucontext_alloc_tables(struct cpu *cp)
 174 {
 175         tss_t *ntss;
 176         struct cpu_tables *ct;
 177 
 178         /*
 179          * Allocate space for stack, tss, gdt and idt. We round the size
 180          * allotted for cpu_tables up, so that the TSS is on a unique page.
 181          * This is more efficient when running in virtual machines.
 182          */
 183         ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
 184         if ((uintptr_t)ct & PAGEOFFSET)
 185                 panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
 186                     cp->cpu_id);
 187 
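
P2ROUNDUP() is what makes the alignment check above safe to rely on: the size is padded to a whole number of pages, and kmem_zalloc() returns page-aligned memory for page-multiple sizes. A minimal sketch of the rounding, assuming the usual illumos power-of-two definition from <sys/sysmacros.h> and a hypothetical struct size:

    #define ROUNDUP_P2(x, align)    (-(-(x) & -(align)))

    /*
     * e.g. ROUNDUP_P2(0x1868, 0x1000) == 0x2000: a hypothetical
     * 6248-byte cpu_tables gets two whole pages, keeping the trailing
     * TSS on its own page as the comment above requires.
     */
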
 188         ntss = cp->cpu_tss = &ct->ct_tss;
 189 
 190 #if defined(__amd64)
 191 
 192         /*
 193          * #DF (double fault).
 194          */
 195         ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];
 196 
 197 #elif defined(__i386)
 198 
 199         ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
 200             (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];
 201 
 202         ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;
 203 
 204         ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;
 205 
 206         ntss->tss_cs = KCS_SEL;
 207         ntss->tss_ds = ntss->tss_es = KDS_SEL;
 208         ntss->tss_fs = KFS_SEL;
 209         ntss->tss_gs = KGS_SEL;
 210 
 211 #endif  /* __i386 */
 212 
 213         /*
 214          * Set I/O bit map offset equal to size of TSS segment limit
 215          * for no I/O permission map. This will cause all user I/O
 216          * instructions to generate a #GP fault.
 217          */
 218         ntss->tss_bitmapbase = sizeof (*ntss);
 219 
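
The bitmap-base trick above leans on architectural behavior worth spelling out: the processor consults the I/O permission bitmap only when CPL > IOPL, and it treats any bitmap bit lying beyond the TSS limit as denying access. A sketch of the invariant (commentary only, not new kernel code):

    /*
     * TSS limit == sizeof (*ntss) - 1, bitmap base == sizeof (*ntss):
     * every lookup for an IN/OUT from user mode falls past the limit,
     * so the access is denied and the instruction takes #GP, i.e. the
     * "no I/O permission map" case.
     */
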
 220         /*


 291         }
 292 
 293         /* Copy CPU startup code to rm_platter for CPU hot-add operations. */
 294         if (plat_dr_enabled()) {
 295                 bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
 296                     (size_t)real_mode_start_cpu_end -
 297                     (size_t)real_mode_start_cpu);
 298         }
 299 
 300         /*
 301          * Now copy all that we've set up onto the real mode platter
 302          * for the real mode code to digest as part of starting the cpu.
 303          */
 304         rm->rm_idt_base = cp->cpu_idt;
 305         rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
 306         rm->rm_gdt_base = cp->cpu_gdt;
 307         rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;
 308 
 309         /*
 310          * CPU needs to access kernel address space after powering on.
 311          * When hot-adding CPU at runtime, directly use top level page table
 312          * of kas rather than the return value of getcr3(). getcr3() returns
 313          * current process's top level page table, which may be different from
 314          * the one of kas.
 315          */
 316         rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
 317         rm->rm_cpu = cp->cpu_id;
 318 
 319         /*
 320          * For hot-adding CPU at runtime, Machine Check and Performance Counter
 321          * should be disabled. They will be enabled on demand after CPU powers
 322          * on successfully.
 323          */
 324         rm->rm_cr4 = getcr4();
 325         rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);
 326 
 327         rmp_gdt_init(rm);
 328 
 329         return (ct);
 330 }
 331 
 332 void
 333 mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
 334 {
 335         struct cpu_tables *ct = arg;
 336 
 337         ASSERT(&ct->ct_tss == cp->cpu_tss);
 338         if (optype == MACH_CPUCONTEXT_OP_START) {
 339                 switch (err) {
 340                 case 0:
 341                         /*
 342                          * Save pointer for reuse when stopping CPU.
 343                          */
 344                         cp->cpu_m.mcpu_mach_ctx_ptr = arg;
 345                         break;




   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2018 Joyent, Inc
  30  */
  31 
  32 /*
  33  * Welcome to the world of the "real mode platter".
  34  * See also startup.c, mpcore.s and apic.c for related routines.
  35  */
  36 
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cpuvar.h>
  40 #include <sys/cpu_module.h>
  41 #include <sys/kmem.h>
  42 #include <sys/archsystm.h>
  43 #include <sys/machsystm.h>
  44 #include <sys/controlregs.h>
  45 #include <sys/x86_archext.h>
  46 #include <sys/smp_impldefs.h>
  47 #include <sys/sysmacros.h>
  48 #include <sys/mach_mmu.h>
  49 #include <sys/promif.h>


 116 mach_cpucontext_fini(void)
 117 {
 118         if (warm_reset_vector)
 119                 psm_unmap_phys((caddr_t)warm_reset_vector,
 120                     sizeof (warm_reset_vector));
 121         hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
 122             HAT_UNLOAD);
 123 }
 124 
 125 #if defined(__amd64)
 126 extern void *long_mode_64(void);
 127 #endif  /* __amd64 */
 128 
 129 /*ARGSUSED*/
 130 void
 131 rmp_gdt_init(rm_platter_t *rm)
 132 {
 133 
 134 #if defined(__amd64)
 135         /* Use the kas address space for the CPU startup thread. */
 136         if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) {
 137                 panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
 138                     "located above 4G in physical memory (@ 0x%lx)",
 139                     mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
 140         }
 141 
 142         /*
 143          * Set up pseudo-descriptors for temporary GDT and IDT for use ONLY
 144          * by code in real_mode_start_cpu():
 145          *
 146          * GDT[0]:  NULL selector
 147          * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
 148          *
 149          * Clear the IDT as interrupts will be off and a limit of 0 will cause
 150          * the CPU to triple fault and reset on an NMI, seemingly as reasonable
 151          * a course of action as any other, though it may cause the entire
 152          * platform to reset in some cases...
 153          */
 154         rm->rm_temp_gdt[0] = 0;
 155         rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
 156 
 157         rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
 158         rm->rm_temp_gdt_base = rm_platter_pa +
 159             (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
 160         rm->rm_temp_idt_lim = 0;
 161         rm->rm_temp_idt_base = 0;
 162 
 163         /*
 164          * Since the CPU needs to jump to protected mode using an
 165          * identity-mapped address, we need to calculate it here.
 166          */
 167         rm->rm_longmode64_addr = rm_platter_pa +
 168             (uint32_t)((uintptr_t)long_mode_64 -
 169             (uintptr_t)real_mode_start_cpu);
 170 #endif  /* __amd64 */
 171 }
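
Relative to the old version, the 4G check now uses mmu_ptob() rather than MAKECR3(), which under KPTI takes a PCID argument and may fold non-address bits into its result; the check only cares about the physical address. A sketch of what it computes, assuming MMU_PAGESHIFT == 12:

    /* mmu_ptob(): page frame number to physical byte address. */
    uint64_t pa = (uint64_t)kas.a_hat->hat_htable->ht_pfn << 12;
    if (pa > 0xffffffffUL)
            panic(...);   /* won't fit the platter's 32-bit rm_pdbr */
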
 172 
 173 static void *
 174 mach_cpucontext_alloc_tables(struct cpu *cp)
 175 {
 176         tss_t *ntss;
 177         struct cpu_tables *ct;
 178         size_t ctsize;
 179 
 180         /*
 181          * Allocate space for stack, tss, gdt and idt. We round the size
 182          * allotted for cpu_tables up, so that the TSS is on a unique page.
 183          * This is more efficient when running in virtual machines.
 184          */
 185         ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE);
 186         ct = kmem_zalloc(ctsize, KM_SLEEP);
 187         if ((uintptr_t)ct & PAGEOFFSET)
 188                 panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
 189                     cp->cpu_id);
 190 
 191         ntss = cp->cpu_tss = &ct->ct_tss;
 192 
 193 #if defined(__amd64)
 194         uintptr_t va;
 195         size_t len;
 196 
 197         /*
 198          * #DF (double fault).
 199          */
 200         ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
 201 
 202         /*
 203          * #NMI (non-maskable interrupt)
 204          */
 205         ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)];
 206 
 207         /*
 208          * #MC (machine check exception / hardware error)
 209          */
 210         ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)];
 211 
 212         /*
 213          * #DB, #BP debug interrupts and KDI/kmdb
 214          */
 215         ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
 216 
 217         if (kpti_enable == 1) {
 218                 /*
 219                  * #GP, #PF, #SS fault interrupts
 220                  */
 221                 ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
 222 
 223                 /*
 224                  * Used by all other interrupts
 225                  */
 226                 ntss->tss_ist6 = (uintptr_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp;
 227 
 228                 /*
 229                  * On AMD64 we need to make sure that all of the pages of the
 230                  * struct cpu_tables are punched through onto the user CPU for
 231                  * kpti.
 232                  *
 233                  * The final page will always be the TSS, so treat that
 234                  * separately.
 235                  */
 236                 for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE;
 237                     len >= MMU_PAGESIZE;
 238                     len -= MMU_PAGESIZE, va += MMU_PAGESIZE) {
 239                         /* The doublefault stack must be RW */
 240                         hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE);
 241                 }
 242                 ASSERT3U((uintptr_t)ntss, ==, va);
 243                 hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ);
 244         }
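
A worked trace of the punch-in loop may help; it assumes, purely for illustration, that ctsize rounded to four pages:

    /*
     * va = ct + 0 pages, len = 3 pages  -> punched PROT_READ|PROT_WRITE
     * va = ct + 1 page,  len = 2 pages  -> punched RW
     * va = ct + 2 pages, len = 1 page   -> punched RW
     * loop exits with va == ct + 3 pages == (uintptr_t)ntss, which the
     * ASSERT3U() checks; the TSS page is then punched PROT_READ only,
     * since nothing reachable from the user page tables may be allowed
     * to modify the TSS.
     */
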
 245 
 246 #elif defined(__i386)
 247 
 248         ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
 249             (uint32_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
 250 
 251         ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;
 252 
 253         ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;
 254 
 255         ntss->tss_cs = KCS_SEL;
 256         ntss->tss_ds = ntss->tss_es = KDS_SEL;
 257         ntss->tss_fs = KFS_SEL;
 258         ntss->tss_gs = KGS_SEL;
 259 
 260 #endif  /* __i386 */
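
On the 32-bit kernel the TSS earns its keep through stack switching rather than ISTs; a brief architectural note on the fields above (background, not webrev code):

    /*
     * On an interrupt or trap from ring 3, the CPU loads ss0:esp0 from
     * the current TSS before pushing the trap frame, which is why each
     * ring's slot points at the top of ct_stack1 and at KDS_SEL.  The
     * eip/cs/ds/... fields would only matter for hardware task
     * switching, which is not otherwise used here.
     */
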
 261 
 262         /*
 263          * Set I/O bit map offset equal to size of TSS segment limit
 264          * for no I/O permission map. This will cause all user I/O
 265          * instructions to generate a #GP fault.
 266          */
 267         ntss->tss_bitmapbase = sizeof (*ntss);
 268 
 269         /*


 340         }
 341 
 342         /* Copy CPU startup code to rm_platter for CPU hot-add operations. */
 343         if (plat_dr_enabled()) {
 344                 bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
 345                     (size_t)real_mode_start_cpu_end -
 346                     (size_t)real_mode_start_cpu);
 347         }
 348 
 349         /*
 350          * Now copy all that we've set up onto the real mode platter
 351          * for the real mode code to digest as part of starting the cpu.
 352          */
 353         rm->rm_idt_base = cp->cpu_idt;
 354         rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
 355         rm->rm_gdt_base = cp->cpu_gdt;
 356         rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;
 357 
 358         /*
 359          * CPU needs to access kernel address space after powering on.
 360          */
 361         rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE);
 362         rm->rm_cpu = cp->cpu_id;
 363 
 364         /*
 365          * We need to mask off any bits set on our boot CPU that can't apply
 366          * while the subject CPU is initializing.  If appropriate, they are
 367          * enabled later on.
 368          */
 369         rm->rm_cr4 = getcr4();
 370         rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE);
 371 
 372         rmp_gdt_init(rm);
 373 
 374         return (ct);
 375 }
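
The CR4_PCIDE addition to the mask is load-bearing: architecturally, CR4.PCIDE may only be set while IA-32e mode is active (and with CR3[11:0] == 0), and the AP turns paging on while still working through 32-bit protected mode, so inheriting the boot CPU's PCIDE bit would fault. A sketch restating the lines above with that constraint spelled out:

    rm->rm_cr4 = getcr4();          /* start from the boot CPU's CR4 */
    /*
     * PCIDE must wait until the AP is in long mode (setting it earlier
     * raises #GP); MCE and PCE wait until the CPU is known healthy.
     * All are re-enabled later, on demand, once the CPU is fully up.
     */
    rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE);

This also pairs with the MAKECR3(..., PCID_NONE) above: the AP's initial cr3 carries no PCID until PCIDE is switched on later.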
 376 
 377 void
 378 mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
 379 {
 380         struct cpu_tables *ct = arg;
 381 
 382         ASSERT(&ct->ct_tss == cp->cpu_tss);
 383         if (optype == MACH_CPUCONTEXT_OP_START) {
 384                 switch (err) {
 385                 case 0:
 386                         /*
 387                          * Save pointer for reuse when stopping CPU.
 388                          */
 389                         cp->cpu_m.mcpu_mach_ctx_ptr = arg;
 390                         break;