/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * This file contains the trampolines that are used by KPTI in order to be
 * able to take interrupts/traps/etc. while on the "user" page table.
 *
 * We don't map the full kernel text into the user page table: instead we
 * map this one small section of trampolines (which compiles to ~13 pages).
 * These trampolines are set in the IDT always (so they will run no matter
 * whether we're on the kernel or user page table), and their primary job is
 * to pivot us to the kernel %cr3 and %rsp without ruining everything.
 *
 * All of these interrupts use the amd64 IST feature when we have KPTI
 * enabled, meaning that they will execute with their %rsp set to a known
 * location, even if we take them in the kernel.
 *
 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the
 * mcpu_kpti (a struct kpti_frame) defined in machcpuvar.h. This struct is
 * set up to be page-aligned, and we map the page it's on into both page
 * tables. Using a struct attached to the cpu_t also means that we can use
 * %rsp-relative addressing to find anything on the cpu_t, so we don't have
 * to touch %gs or GSBASE at all on incoming interrupt trampolines (which
 * can get pretty hairy).
 *
 * This little struct is where the CPU will push the actual interrupt frame.
 * Then, in the trampoline, we change %cr3, then figure out our destination
 * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
 * frame). Then we jump to the regular ISR in the kernel text and carry on as
 * normal.
 *
 * We leave the original frame and any spilled regs behind in the kpti_frame
 * lazily until we want to return to userland. Then, we clear any spilled
 * regs from it, and overwrite the rest with our iret frame. When switching
 * this CPU to a different process (in hat_switch), we bzero the whole region
 * to make sure nothing can leak between processes.
 *
 * When we're later returning to the original place where we took the
 * interrupt (especially if it was in userland), we have to jmp back to the
 * "return trampolines" here, since when we set %cr3 back to the user value,
 * we need to be executing from code here in these shared pages and not the
 * main kernel text again. Even though it should be fine to iret directly
 * from kernel text when returning to kernel code, we make things jmp to a
 * trampoline here just for consistency.
 *
 * Note that with IST, it's very important that we must always have pivoted
 * away from the IST stack before we can possibly take any other interrupt
 * on the same IST (unless it's an end-of-the-world fault and we don't care
 * about coming back from it ever).
 *
 * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
 * regularly have to happen from within trampoline code (e.g. in the sysenter
 * single-step case) and then return to the world normally.
 * As a result, these two are IST'd to their own kpti_frame right above the
 * normal one (in the same page), so they don't clobber their parent
 * interrupt.
 *
 * To aid with debugging, we also IST the page fault (#PF/pftrap), general
 * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
 * their own separate kpti_frame. This ensures that if we take one of these
 * due to a bug in trampoline code, we preserve the original trampoline
 * state that caused the trap.
 *
 * NMI, MCE and dblfault interrupts are also taken on their own dedicated IST
 * stacks, since they can interrupt another ISR at any time. These stacks are
 * full-sized, however, and not a little kpti_frame struct. We only set %cr3
 * in their trampolines (and do it unconditionally), and don't bother
 * pivoting away. We're either going into the panic() path, or we're going to
 * return straight away without rescheduling, so it's fine not to be on our
 * real kthread stack (and some of the state we want to go find it with might
 * be corrupt!)
 *
 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this
 * to point at the PML4 for kas early in boot and never touch it again.
 * Hopefully it survives whatever corruption brings down the rest of the
 * kernel!
 *
 * Syscalls are different from interrupts (at least in the SYSENTER/SYSCALL64
 * cases) in that they do not push an interrupt frame (and also have some
 * other effects). In the syscall trampolines, we assume that we can only be
 * taking the call from userland and use swapgs and an unconditional
 * overwrite of %cr3. We do not do any stack pivoting for syscalls (and we
 * leave SYSENTER's existing %rsp pivot untouched) -- instead we spill
 * registers into %gs:CPU_KPTI_* as we need to.
 *
 * Note that the normal %cr3 values do not cause invalidations with PCIDE --
 * see hat_switch().
 */

/*
 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 * fix bugs here, check to see if they should be fixed there as well.
 */
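
/*
 * To make the layout above concrete, here is roughly how a trampoline finds
 * its cpu_t from the IST %rsp (illustrative C only -- struct kpti_frame
 * itself is defined in machcpuvar.h, and CPU_KPTI_START/CPU_THREAD are the
 * assym offsets used by the macros further down in this file):
 *
 *	// The IST %rsp points into the page-aligned mcpu_kpti frame, which
 *	// sits at offset CPU_KPTI_START inside the cpu_t, so:
 *	uintptr_t page = rsp & ~((uintptr_t)MMU_PAGESIZE - 1);
 *	cpu_t *cpu = (cpu_t *)(page - CPU_KPTI_START);
 *	kthread_t *t = cpu->cpu_thread;		// CPU_THREAD(%r13) below
 *
 * This arithmetic is why the kpti_frame must be page-aligned, and why the
 * incoming trampolines never need to touch %gs or GSBASE.
 */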

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>
#include <sys/param.h>

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

	.data
	DGDEF3(kpti_enable, 8, 8)
	.fill	1, 8, 1

#if DEBUG
	.data
_bad_ts_panic_msg:
	.string "kpti_trampolines.s: tr_iret_user but CR0.TS set"
#endif

.section ".text";
.align MMU_PAGESIZE

.global kpti_tramp_start
kpti_tramp_start:
	nop

/* This will be set by mlsetup, and then double-checked later */
.global kpti_safe_cr3
kpti_safe_cr3:
	.quad 0
	SET_SIZE(kpti_safe_cr3)

/* startup_kmem() will overwrite this */
.global kpti_kbase
kpti_kbase:
	.quad KERNELBASE
	SET_SIZE(kpti_kbase)

#define	SET_KERNEL_CR3(spillreg)		\
	mov	%cr3, spillreg;			\
	mov	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_KCR3, spillreg;	\
	cmp	$0, spillreg;			\
	je	2f;				\
	mov	spillreg, %cr3;			\
2:

#if DEBUG
#define	SET_USER_CR3(spillreg)			\
	mov	%cr3, spillreg;			\
	mov	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_UCR3, spillreg;	\
	mov	spillreg, %cr3
#else
#define	SET_USER_CR3(spillreg)			\
	mov	%gs:CPU_KPTI_UCR3, spillreg;	\
	mov	spillreg, %cr3
#endif

#define	PIVOT_KPTI_STK(spillreg)		\
	mov	%rsp, spillreg;			\
	mov	%gs:CPU_KPTI_RET_RSP, %rsp;	\
	pushq	T_FRAMERET_SS(spillreg);	\
	pushq	T_FRAMERET_RSP(spillreg);	\
	pushq	T_FRAMERET_RFLAGS(spillreg);	\
	pushq	T_FRAMERET_CS(spillreg);	\
	pushq	T_FRAMERET_RIP(spillreg)


#define	INTERRUPT_TRAMPOLINE_P(errpush)			\
	pushq	%r13;					\
	pushq	%r14;					\
	subq	$KPTI_R14, %rsp;			\
	/* Save current %cr3. */			\
	mov	%cr3, %r14;				\
	mov	%r14, KPTI_TR_CR3(%rsp);		\
							\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);		\
	je	3f;					\
1:							\
	/* Change to the "kernel" %cr3 */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	cmp	$0, %r14;				\
	je	2f;					\
	mov	%r14, %cr3;				\
2:							\
	/* Get our cpu_t in %r13 */			\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	subq	$CPU_KPTI_START, %r13;			\
	/* Use top of the kthread stk */		\
	mov	CPU_THREAD(%r13), %r14;			\
	mov	T_STACK(%r14), %r14;			\
	addq	$REGSIZE+MINFRAME, %r14;		\
	jmp	4f;					\
3:							\
	/* Check the %rsp in the frame. */		\
	/* Is it above kernel base? */			\
	mov	kpti_kbase, %r14;			\
	cmp	%r14, KPTI_RSP(%rsp);			\
	jb	1b;					\
	/* Use the %rsp from the trap frame */		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~0xf), %r14;				\
4:							\
	mov	%rsp, %r13;				\
	/* %r14 contains our destination stk */		\
	mov	%r14, %rsp;				\
	pushq	KPTI_SS(%r13);				\
	pushq	KPTI_RSP(%r13);				\
	pushq	KPTI_RFLAGS(%r13);			\
	pushq	KPTI_CS(%r13);				\
	pushq	KPTI_RIP(%r13);				\
	errpush;					\
	mov	KPTI_R14(%r13), %r14;			\
	mov	KPTI_R13(%r13), %r13

#define	INTERRUPT_TRAMPOLINE_NOERR	\
	INTERRUPT_TRAMPOLINE_P(/**/)

#define	INTERRUPT_TRAMPOLINE		\
	INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
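
/*
 * In rough C terms, INTERRUPT_TRAMPOLINE_P above picks its destination stack
 * like this (illustrative pseudocode only -- the macro is authoritative, and
 * the KPTI_*/CPU_*/T_* names are the assym offsets it uses):
 *
 *	// %rsp currently points into this cpu's kpti_frame, where %r13/%r14
 *	// and the hardware-pushed frame have been saved.
 *	if (KPTI_CS != KCS_SEL || KPTI_RSP < kpti_kbase) {
 *		// Interrupted userland (or a bogus kernel %rsp): load the
 *		// kernel %cr3 (if one is set) and land on the top of the
 *		// current kthread's stack.
 *		dest = CPU_THREAD->T_STACK + REGSIZE + MINFRAME;
 *	} else {
 *		// Interrupted the kernel proper: keep using the stack we
 *		// interrupted, 16-byte aligned, and leave %cr3 alone.
 *		dest = KPTI_RSP & ~0xf;
 *	}
 *	// Re-push SS/RSP/RFLAGS/CS/RIP (and the error code, if any) at dest,
 *	// restore %r13/%r14, and fall through to the real ISR.
 */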

/*
 * This is used for all interrupts that can plausibly be taken inside another
 * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 *
 * We also use this for #NP, even though it uses the standard IST: the
 * additional %rsp checks below will catch the case where we get an exception
 * doing an iret to userspace with a bad %cs/%ss. This appears as a kernel
 * trap, and only later gets redirected via kern_gpfault().
 *
 * We check for whether we took the interrupt while in another trampoline, in
 * which case we need to use the kthread stack.
 */
#define	DBG_INTERRUPT_TRAMPOLINE_P(errpush)		\
	pushq	%r13;					\
	pushq	%r14;					\
	subq	$KPTI_R14, %rsp;			\
	/* Check for clobbering */			\
	cmp	$0, KPTI_FLAG(%rsp);			\
	je	1f;					\
	/* Don't worry, this totally works */		\
	int	$8;					\
1:							\
	movq	$1, KPTI_FLAG(%rsp);			\
	/* Save current %cr3. */			\
	mov	%cr3, %r14;				\
	mov	%r14, KPTI_TR_CR3(%rsp);		\
							\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);		\
	je	4f;					\
2:							\
	/* Change to the "kernel" %cr3 */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	cmp	$0, %r14;				\
	je	3f;					\
	mov	%r14, %cr3;				\
3:							\
	/* Get our cpu_t in %r13 */			\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	subq	$CPU_KPTI_START, %r13;			\
	/* Use top of the kthread stk */		\
	mov	CPU_THREAD(%r13), %r14;			\
	mov	T_STACK(%r14), %r14;			\
	addq	$REGSIZE+MINFRAME, %r14;		\
	jmp	6f;					\
4:							\
	/* Check the %rsp in the frame. */		\
	/* Is it above kernel base? */			\
	/* If not, treat as user. */			\
	mov	kpti_kbase, %r14;			\
	cmp	%r14, KPTI_RSP(%rsp);			\
	jb	2b;					\
	/* Is it within the kpti_frame page? */		\
	/* If it is, treat as user interrupt */		\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~(MMU_PAGESIZE - 1)), %r14;		\
	cmp	%r13, %r14;				\
	je	2b;					\
	/* Were we in trampoline code? */		\
	leaq	kpti_tramp_start, %r14;			\
	cmp	%r14, KPTI_RIP(%rsp);			\
	jb	5f;					\
	leaq	kpti_tramp_end, %r14;			\
	cmp	%r14, KPTI_RIP(%rsp);			\
	ja	5f;					\
	/* If we were, change %cr3: we might */		\
	/* have interrupted before it did. */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	mov	%r14, %cr3;				\
5:							\
	/* Use the %rsp from the trap frame */		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~0xf), %r14;				\
6:							\
	mov	%rsp, %r13;				\
	/* %r14 contains our destination stk */		\
	mov	%r14, %rsp;				\
	pushq	KPTI_SS(%r13);				\
	pushq	KPTI_RSP(%r13);				\
	pushq	KPTI_RFLAGS(%r13);			\
	pushq	KPTI_CS(%r13);				\
	pushq	KPTI_RIP(%r13);				\
	errpush;					\
	mov	KPTI_R14(%r13), %r14;			\
	movq	$0, KPTI_FLAG(%r13);			\
	mov	KPTI_R13(%r13), %r13

#define	DBG_INTERRUPT_TRAMPOLINE_NOERR	\
	DBG_INTERRUPT_TRAMPOLINE_P(/**/)

#define	DBG_INTERRUPT_TRAMPOLINE	\
	DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
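
/*
 * The debug variant above differs from INTERRUPT_TRAMPOLINE_P in three ways;
 * as a loose sketch (illustrative only, using the assym names from the macro):
 *
 *	if (KPTI_FLAG != 0)	// this kpti_frame is already in use --
 *		int $8;		// we clobbered ourselves; force a #DF panic
 *	KPTI_FLAG = 1;		// mark the frame busy (cleared on the way out)
 *
 *	// Even with a kernel %cs, treat the entry as "user" if the
 *	// interrupted %rsp is below kpti_kbase or points into this cpu's own
 *	// kpti_frame page (i.e. we interrupted another trampoline's stack).
 *
 *	// If the interrupted %rip lies within [kpti_tramp_start,
 *	// kpti_tramp_end], reload the kernel %cr3 anyway: the interrupted
 *	// trampoline may not have gotten around to switching it yet.
 */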

/*
 * These labels (_start and _end) are used by trap.c to determine if
 * we took an interrupt like an NMI during the return process.
 */
.global tr_sysc_ret_start
tr_sysc_ret_start:

/*
 * Syscall return trampolines.
 *
 * These are expected to be called on the kernel %gs. tr_sysret[ql] are
 * called after %rsp is changed back to the user value, so we have no
 * stack to work with. tr_sysexit has a kernel stack (but has to
 * preserve rflags, soooo).
 */
ENTRY_NP(tr_sysretq)
	cmpq	$1, kpti_enable
	jne	1f

	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
1:
	swapgs
	sysretq
	SET_SIZE(tr_sysretq)

ENTRY_NP(tr_sysretl)
	cmpq	$1, kpti_enable
	jne	1f

	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
1:
	SWAPGS
	SYSRETL
	SET_SIZE(tr_sysretl)

ENTRY_NP(tr_sysexit)
	/*
	 * Note: we want to preserve RFLAGS across this branch, since sysexit
	 * (unlike sysret above) does not restore RFLAGS for us.
	 *
	 * We still have the real kernel stack (sysexit does restore that), so
	 * we can use pushfq/popfq.
	 */
	pushfq

	cmpq	$1, kpti_enable
	jne	1f

	/* Have to pop it back off now before we change %cr3! */
	popfq
	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	jmp	2f
1:
	popfq
2:
	swapgs
	sti
	sysexit
	SET_SIZE(tr_sysexit)

.global tr_sysc_ret_end
tr_sysc_ret_end:

/*
 * Syscall entry trampolines.
 */

#if DEBUG
#define	MK_SYSCALL_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	mov	%cr3, %r13;			\
	mov	%r13, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_KCR3, %r13;	\
	mov	%r13, %cr3;			\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)
#else
#define	MK_SYSCALL_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	mov	%gs:CPU_KPTI_KCR3, %r13;	\
	mov	%r13, %cr3;			\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)
#endif

MK_SYSCALL_TRAMPOLINE(sys_syscall)
MK_SYSCALL_TRAMPOLINE(sys_syscall32)
MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
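
/*
 * For contrast with the interrupt trampolines above: since SYSCALL pushes no
 * frame and doesn't touch %rsp, the (non-DEBUG) expansion of
 * MK_SYSCALL_TRAMPOLINE boils down to just this (illustrative pseudocode):
 *
 *	swapgs;				// kernel gsbase, to reach our cpu_t
 *	%gs:CPU_KPTI_R13 = %r13;	// spill one scratch register
 *	%cr3 = %gs:CPU_KPTI_KCR3;	// unconditionally go to the kernel
 *					// page table (we can only have come
 *					// from userland)
 *	%r13 = %gs:CPU_KPTI_R13;	// restore the scratch register
 *	swapgs;				// back to the user gsbase -- the real
 *					// handler does its own swapgs
 *	goto isr;			// e.g. sys_syscall
 *
 * No stack pivot happens here; the real handler sets up its own stack.
 */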

/*
 * SYSENTER is special. The CPU is really not very helpful when it
 * comes to preserving and restoring state with it, and as a result
 * we have to do all of it by hand. So, since we want to preserve
 * RFLAGS, we have to be very careful in these trampolines to not
 * clobber any bits in it. That means no cmpqs or branches!
 */
ENTRY_NP(tr_sys_sysenter)
	swapgs
	mov	%r13, %gs:CPU_KPTI_R13
#if DEBUG
	mov	%cr3, %r13
	mov	%r13, %gs:CPU_KPTI_TR_CR3
#endif
	mov	%gs:CPU_KPTI_KCR3, %r13
	mov	%r13, %cr3
	mov	%gs:CPU_KPTI_R13, %r13
	jmp	_sys_sysenter_post_swapgs
	SET_SIZE(tr_sys_sysenter)

ENTRY_NP(tr_brand_sys_sysenter)
	swapgs
	mov	%r13, %gs:CPU_KPTI_R13
#if DEBUG
	mov	%cr3, %r13
	mov	%r13, %gs:CPU_KPTI_TR_CR3
#endif
	mov	%gs:CPU_KPTI_KCR3, %r13
	mov	%r13, %cr3
	mov	%gs:CPU_KPTI_R13, %r13
	jmp	_brand_sys_sysenter_post_swapgs
	SET_SIZE(tr_brand_sys_sysenter)

#define	MK_SYSCALL_INT_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	SET_KERNEL_CR3(%r13);			\
	mov	%gs:CPU_THREAD, %r13;		\
	mov	T_STACK(%r13), %r13;		\
	addq	$REGSIZE+MINFRAME, %r13;	\
	mov	%r13, %rsp;			\
	pushq	%gs:CPU_KPTI_SS;		\
	pushq	%gs:CPU_KPTI_RSP;		\
	pushq	%gs:CPU_KPTI_RFLAGS;		\
	pushq	%gs:CPU_KPTI_CS;		\
	pushq	%gs:CPU_KPTI_RIP;		\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)

MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)

/*
 * Interrupt/trap return trampolines
 */

.global tr_intr_ret_start
tr_intr_ret_start:

ENTRY_NP(tr_iret_auto)
	cmpq	$1, kpti_enable
	jne	tr_iret_kernel
	cmpw	$KCS_SEL, T_FRAMERET_CS(%rsp)
	je	tr_iret_kernel
	jmp	tr_iret_user
	SET_SIZE(tr_iret_auto)

ENTRY_NP(tr_iret_kernel)
	/*
	 * Yes, this does nothing extra. But this way we know if we see iret
	 * elsewhere, then we've failed to properly consider trampolines there.
	 */
	iretq
	SET_SIZE(tr_iret_kernel)

ENTRY_NP(tr_iret_user)
#if DEBUG
	/*
	 * Panic if we find CR0.TS set. We're still on the kernel stack and
	 * %cr3, but we do need to swap back to the kernel gs. (We don't worry
	 * about swapgs speculation here.)
	 */
	pushq	%rax
	mov	%cr0, %rax
	testq	$CR0_TS, %rax
	jz	1f
	swapgs
	popq	%rax
	leaq	_bad_ts_panic_msg(%rip), %rdi
	xorl	%eax, %eax
	pushq	%rbp
	movq	%rsp, %rbp
	call	panic
1:
	popq	%rax
#endif

	cmpq	$1, kpti_enable
	jne	1f

	/*
	 * KPTI enabled: we're on the user gsbase at this point, so we
	 * need to swap back so we can pivot stacks.
	 *
	 * The swapgs lfence mitigation is probably not needed here
	 * since a mis-speculation of the above branch would imply KPTI
	 * is disabled, but we'll do so anyway.
	 */
	swapgs
	lfence
	mov	%r13, %gs:CPU_KPTI_R13
	PIVOT_KPTI_STK(%r13)
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap. */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	/* And back to user gsbase again. */
	swapgs
1:
	iretq
	SET_SIZE(tr_iret_user)
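
/*
 * Pulling PIVOT_KPTI_STK and SET_USER_CR3 together, the KPTI branch of
 * tr_iret_user above does roughly the following (illustrative pseudocode):
 *
 *	swapgs; lfence;			// back onto the kernel gsbase
 *	// PIVOT_KPTI_STK: copy the iret frame off the kthread stack onto the
 *	// per-cpu return area at %gs:CPU_KPTI_RET_RSP, and point %rsp at it.
 *	// SET_USER_CR3: switch to the user page table; from here on only
 *	// these trampoline pages (and the kpti_frame page) are still mapped,
 *	// which is why we must iret from here and not from kernel text.
 *	// Zero the CPU_KPTI_R13/R14 spill slots so nothing leaks.
 *	swapgs;				// restore the user gsbase
 *	iretq;				// consumes the frame on the kpti_frame page
 */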

/*
 * This special return trampoline is for KDI's use only (with kmdb).
 *
 * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 * instead. This trampoline runs after GSBASE has already been changed
 * back to the userland value (so we can't use %gs).
 *
 * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 * The KPTI_R13 member in the kpti_dbg has already been set to what the
 * real %r13 should be before we IRET.
 *
 * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 * took an interrupt, and has put that back in the kpti_dbg area for us
 * to use, so we don't do any sniffing of %cs here. This is important
 * so that debugging code that changes %cr3 is possible.
 */
ENTRY_NP(tr_iret_kdi)
	movq	%r14, KPTI_R14(%r13)	/* %r14 has to be preserved by us */

	movq	%rsp, %r14	/* original %rsp is pointing at IRET frame */
	leaq	KPTI_TOP(%r13), %rsp
	pushq	T_FRAMERET_SS(%r14)
	pushq	T_FRAMERET_RSP(%r14)
	pushq	T_FRAMERET_RFLAGS(%r14)
	pushq	T_FRAMERET_CS(%r14)
	pushq	T_FRAMERET_RIP(%r14)

	movq	KPTI_TR_CR3(%r13), %r14
	movq	%r14, %cr3

	movq	KPTI_R14(%r13), %r14
	movq	KPTI_R13(%r13), %r13	/* preserved by our caller */

	iretq
	SET_SIZE(tr_iret_kdi)

.global	tr_intr_ret_end
tr_intr_ret_end:

/*
 * Interrupt/trap entry trampolines
 */

/* CPU pushed an error code, and ISR wants one */
#define	MK_INTR_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);		\
	INTERRUPT_TRAMPOLINE;		\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU didn't push an error code, and ISR doesn't want one */
#define	MK_INTR_TRAMPOLINE_NOERR(isr)	\
	ENTRY_NP(tr_/**/isr);		\
	push	$0;			\
	INTERRUPT_TRAMPOLINE_NOERR;	\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU pushed an error code, and ISR wants one */
#define	MK_DBG_INTR_TRAMPOLINE(isr)	\
	ENTRY_NP(tr_/**/isr);		\
	DBG_INTERRUPT_TRAMPOLINE;	\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU didn't push an error code, and ISR doesn't want one */
#define	MK_DBG_INTR_TRAMPOLINE_NOERR(isr)	\
	ENTRY_NP(tr_/**/isr);			\
	push	$0;				\
	DBG_INTERRUPT_TRAMPOLINE_NOERR;		\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)


MK_INTR_TRAMPOLINE_NOERR(div0trap)
MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
MK_INTR_TRAMPOLINE_NOERR(boundstrap)
MK_INTR_TRAMPOLINE_NOERR(invoptrap)
MK_INTR_TRAMPOLINE_NOERR(ndptrap)
MK_INTR_TRAMPOLINE(invtsstrap)
MK_DBG_INTR_TRAMPOLINE(segnptrap)
MK_DBG_INTR_TRAMPOLINE(stktrap)
MK_DBG_INTR_TRAMPOLINE(gptrap)
MK_DBG_INTR_TRAMPOLINE(pftrap)
MK_INTR_TRAMPOLINE_NOERR(resvtrap)
MK_INTR_TRAMPOLINE_NOERR(ndperr)
MK_INTR_TRAMPOLINE(achktrap)
MK_INTR_TRAMPOLINE_NOERR(xmtrap)
MK_INTR_TRAMPOLINE_NOERR(invaltrap)
MK_INTR_TRAMPOLINE_NOERR(fasttrap)
MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)

/*
 * These are special because they can interrupt other traps, and
 * each other. We don't need to pivot their stacks, because they have
 * dedicated IST stack space, but we need to change %cr3.
 */
ENTRY_NP(tr_nmiint)
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	nmiint
	SET_SIZE(tr_nmiint)

#if !defined(__xpv)
ENTRY_NP(tr_syserrtrap)
	/*
	 * If we got here, we should always have a zero error code pushed.
	 * The INT $0x8 instruction doesn't seem to push one, though, which
	 * we use as an emergency panic in the other trampolines. So adjust
	 * things here.
	 */
	cmpq	$0, (%rsp)
	je	1f
	pushq	$0
1:
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	syserrtrap
	SET_SIZE(tr_syserrtrap)
#endif

ENTRY_NP(tr_mcetrap)
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	mcetrap
	SET_SIZE(tr_mcetrap)

/*
 * Interrupts start at 32
 */
#define	MKIVCT(n)			\
	ENTRY_NP(tr_ivct/**/n)		\
	push	$0;			\
	INTERRUPT_TRAMPOLINE;		\
	push	$n - 0x20;		\
	jmp	cmnint;			\
	SET_SIZE(tr_ivct/**/n)

MKIVCT(32);	MKIVCT(33);	MKIVCT(34);	MKIVCT(35);
MKIVCT(36);	MKIVCT(37);	MKIVCT(38);	MKIVCT(39);
MKIVCT(40);	MKIVCT(41);	MKIVCT(42);	MKIVCT(43);
MKIVCT(44);	MKIVCT(45);	MKIVCT(46);	MKIVCT(47);
MKIVCT(48);	MKIVCT(49);	MKIVCT(50);	MKIVCT(51);
MKIVCT(52);	MKIVCT(53);	MKIVCT(54);	MKIVCT(55);
MKIVCT(56);	MKIVCT(57);	MKIVCT(58);	MKIVCT(59);
MKIVCT(60);	MKIVCT(61);	MKIVCT(62);	MKIVCT(63);
MKIVCT(64);	MKIVCT(65);	MKIVCT(66);	MKIVCT(67);
MKIVCT(68);	MKIVCT(69);	MKIVCT(70);	MKIVCT(71);
MKIVCT(72);	MKIVCT(73);	MKIVCT(74);	MKIVCT(75);
MKIVCT(76);	MKIVCT(77);	MKIVCT(78);	MKIVCT(79);
MKIVCT(80);	MKIVCT(81);	MKIVCT(82);	MKIVCT(83);
MKIVCT(84);	MKIVCT(85);	MKIVCT(86);	MKIVCT(87);
MKIVCT(88);	MKIVCT(89);	MKIVCT(90);	MKIVCT(91);
MKIVCT(92);	MKIVCT(93);	MKIVCT(94);	MKIVCT(95);
MKIVCT(96);	MKIVCT(97);	MKIVCT(98);	MKIVCT(99);
MKIVCT(100);	MKIVCT(101);	MKIVCT(102);	MKIVCT(103);
MKIVCT(104);	MKIVCT(105);	MKIVCT(106);	MKIVCT(107);
MKIVCT(108);	MKIVCT(109);	MKIVCT(110);	MKIVCT(111);
MKIVCT(112);	MKIVCT(113);	MKIVCT(114);	MKIVCT(115);
MKIVCT(116);	MKIVCT(117);	MKIVCT(118);	MKIVCT(119);
MKIVCT(120);	MKIVCT(121);	MKIVCT(122);	MKIVCT(123);
MKIVCT(124);	MKIVCT(125);	MKIVCT(126);	MKIVCT(127);
MKIVCT(128);	MKIVCT(129);	MKIVCT(130);	MKIVCT(131);
MKIVCT(132);	MKIVCT(133);	MKIVCT(134);	MKIVCT(135);
MKIVCT(136);	MKIVCT(137);	MKIVCT(138);	MKIVCT(139);
MKIVCT(140);	MKIVCT(141);	MKIVCT(142);	MKIVCT(143);
MKIVCT(144);	MKIVCT(145);	MKIVCT(146);	MKIVCT(147);
MKIVCT(148);	MKIVCT(149);	MKIVCT(150);	MKIVCT(151);
MKIVCT(152);	MKIVCT(153);	MKIVCT(154);	MKIVCT(155);
MKIVCT(156);	MKIVCT(157);	MKIVCT(158);	MKIVCT(159);
MKIVCT(160);	MKIVCT(161);	MKIVCT(162);	MKIVCT(163);
MKIVCT(164);	MKIVCT(165);	MKIVCT(166);	MKIVCT(167);
MKIVCT(168);	MKIVCT(169);	MKIVCT(170);	MKIVCT(171);
MKIVCT(172);	MKIVCT(173);	MKIVCT(174);	MKIVCT(175);
MKIVCT(176);	MKIVCT(177);	MKIVCT(178);	MKIVCT(179);
MKIVCT(180);	MKIVCT(181);	MKIVCT(182);	MKIVCT(183);
MKIVCT(184);	MKIVCT(185);	MKIVCT(186);	MKIVCT(187);
MKIVCT(188);	MKIVCT(189);	MKIVCT(190);	MKIVCT(191);
MKIVCT(192);	MKIVCT(193);	MKIVCT(194);	MKIVCT(195);
MKIVCT(196);	MKIVCT(197);	MKIVCT(198);	MKIVCT(199);
MKIVCT(200);	MKIVCT(201);	MKIVCT(202);	MKIVCT(203);
MKIVCT(204);	MKIVCT(205);	MKIVCT(206);	MKIVCT(207);
MKIVCT(208);	MKIVCT(209);	MKIVCT(210);	MKIVCT(211);
MKIVCT(212);	MKIVCT(213);	MKIVCT(214);	MKIVCT(215);
MKIVCT(216);	MKIVCT(217);	MKIVCT(218);	MKIVCT(219);
MKIVCT(220);	MKIVCT(221);	MKIVCT(222);	MKIVCT(223);
MKIVCT(224);	MKIVCT(225);	MKIVCT(226);	MKIVCT(227);
MKIVCT(228);	MKIVCT(229);	MKIVCT(230);	MKIVCT(231);
MKIVCT(232);	MKIVCT(233);	MKIVCT(234);	MKIVCT(235);
MKIVCT(236);	MKIVCT(237);	MKIVCT(238);	MKIVCT(239);
MKIVCT(240);	MKIVCT(241);	MKIVCT(242);	MKIVCT(243);
MKIVCT(244);	MKIVCT(245);	MKIVCT(246);	MKIVCT(247);
MKIVCT(248);	MKIVCT(249);	MKIVCT(250);
MKIVCT(251);	MKIVCT(252);	MKIVCT(253);	MKIVCT(254);	MKIVCT(255);

/*
 * We're PCIDE, but we don't have INVPCID. The only way to invalidate a
 * PCID other than the current one, then, is to load its cr3 then
 * invlpg. But loading kf_user_cr3 means we can no longer access our
 * caller's text mapping (or indeed, its stack). So this little helper
 * has to live within our trampoline text region.
 *
 * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
 */
ENTRY_NP(tr_mmu_flush_user_range)
	push	%rbx
	/* When we read cr3, it never has the NOINVL bit set. */
	mov	%cr3, %rax
	movq	$CR3_NOINVL_BIT, %rbx
	orq	%rbx, %rax

	mov	%rcx, %cr3
	add	%rdi, %rsi
	.align	ASM_ENTRY_ALIGN
1:
	invlpg	(%rdi)
	add	%rdx, %rdi
	cmp	%rsi, %rdi
	jb	1b
	mov	%rax, %cr3
	pop	%rbx
	retq
	SET_SIZE(tr_mmu_flush_user_range)

.align MMU_PAGESIZE
.global kpti_tramp_end
kpti_tramp_end:
	nop
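
/*
 * For reference, tr_mmu_flush_user_range above is roughly equivalent to the
 * following C (illustrative only; getcr3/setcr3/invlpg stand in for the
 * inline register accesses, and the helper must really live in these
 * trampoline pages, as noted above):
 *
 *	void
 *	tr_mmu_flush_user_range(uint64_t addr, uint64_t len, uint64_t pgsz,
 *	    uint64_t cr3)
 *	{
 *		// Remember our own %cr3, with NOINVL set so that switching
 *		// back doesn't flush our current PCID's TLB entries.
 *		uint64_t ret_cr3 = getcr3() | CR3_NOINVL_BIT;
 *
 *		setcr3(cr3);			// activate the target PCID
 *		for (uint64_t va = addr; va < addr + len; va += pgsz)
 *			invlpg(va);		// invalidate within that PCID
 *		setcr3(ret_cr3);		// return without self-flushing
 *	}
 */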