/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
#if !defined(__xpv)
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t	idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t		*ktss0;		/* kernel task state structure */

#if defined(__i386)
tss_t		*dftss0;	/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t	zero_udesc;	/* base zero user desc native procs */
user_desc_t	null_udesc;	/* null user descriptor */
system_desc_t	null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t	zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s.
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments.  Only the conforming bit in type is used along
 * with descriptor priority and present bits.  Default operand size must
 * be zero when in long mode.  In 32-bit compatibility mode all fields
 * are treated as in legacy mode.  For data segments while in long mode
 * only the present bit is loaded.
 */
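
/*
 * For illustration only (mirroring calls made in init_gdt_common() below):
 * a flat 64-bit user code segment is built with
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *
 * while its 32-bit counterpart passes SDP_SHORT and a -1 (4G) limit.
 */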

void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.  The default operand size (D) bit must be
	 * zero when the L bit is set.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16-bit, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}

/*
 * Install a system segment descriptor for LDT and TSS segments.
 */
void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
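
/*
 * Illustrative note: on amd64 a system descriptor is 16 bytes and the 64-bit
 * base is scattered across four fields.  For a base of 0xfffffe0012345678,
 * set_syssegd() stores 0x5678 in ssd_lobase, 0x34 in ssd_midbase, 0x12 in
 * ssd_hibase and 0xfffffe00 in ssd_hi64base; get_ssd_base() above simply
 * reassembles them.
 */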

/*
 * Install a gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts.  We have different ISTs for each class of exceptions
 * that are most likely to occur while handling an existing exception; while
 * many of these are just going to panic, it's nice not to trample on the
 * existing exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space.  This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT always uses the DBG IST: consider single-stepping tr_pftrap,
 * when a read from KMDB causes another #PF.  Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}
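
/*
 * For example, following the switch above: with KPTI enabled,
 * idt_vector_to_ist(T_PGFLT) returns IST_NESTABLE and an ordinary interrupt
 * vector returns IST_DEFAULT; with KPTI disabled both return IST_NONE.
 * T_DBLFLT, T_NMIFLT and T_MCE get their own ISTs regardless.
 */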

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */
void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}

#if defined(__xpv)

/*
 * Converts a hw format gate descriptor into the pseudo-IDT format used by
 * the hypervisor.  Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV	Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}
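
/*
 * Note that on the hypervisor, init_desctbls() below pushes the entire IDT
 * through xen_idt_write() one vector at a time; holes in the IDT are skipped
 * because xen_idt_to_trap_info() returns 0 for them.
 */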

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */
static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment.  The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications.  SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors.  We therefore must ensure that the
	 * kernel uses something, though it will be ignored by hardware, that
	 * is compatible with 32-bit apps.  For the same reason we must set
	 * the default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32- and 64-bit data segments can actually share the same
	 * descriptor.  In long mode only the present bit is checked but all
	 * other fields are loaded.  But in compatibility mode all fields are
	 * interpreted as in legacy mode, so they must be set correctly for a
	 * 32-bit data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT.  By default, the LDT
	 * descriptor in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32-bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT.  See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
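
/*
 * A hedged sketch of one consumer (see the setregs() reference above):
 * clearing an lwp's private %fs descriptor on the current CPU amounts to
 * something like
 *
 *	gdt_update_usegd(GDT_LWPFS, &zero_udesc);
 *
 * with the segment register itself reloaded afterwards.
 */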

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span the entire 32-bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT.  See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure.  If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland.  Bad things will
 * ensue.  We also use interrupt gates on i386, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack are in a broken state.  See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT (coprocessor segment overrun) is not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * Install the fast trap handler at vector 210 (T_FASTTRAP).
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare the interposing descriptor for the syscall handler
	 * and cache a copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}
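
/*
 * brand_interpositioning_enable() and brand_interpositioning_disable() below
 * swap the interposing and default descriptors prepared here in and out of
 * the IDT; the zeroed ih_inum of brand_tbl[1] terminates their loops.
 */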

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize the entire table with the 'reserved' trap and then
	 * overwrite specific entries.  T_EXTOVRFLT (9) is unsupported and
	 * reserved since it can only be generated on a 386 processor.  15 is
	 * also unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones.  Note that this will overwrite some
	 * of the entries installed above, such as T_SYSCALLINT and
	 * T_FASTTRAP.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one.  Under normal circumstances, the LDTR contains 0.  Any process
 * attempting to reference the LDT will therefore cause a #gp.  System calls
 * made via the obsolete lcall mechanism are emulated by the #gp fault
 * handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch, but it'll be overwritten with this same value
	 * anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) intrs. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set the I/O bit map offset equal to the size of the TSS segment
	 * limit for no I/O permission map.  This will force all user I/O
	 * instructions to generate a #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to the descriptor for ktss0 in the gdt.
	 */
	wr_tsr(KTSS_SEL);
}
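
/*
 * Note how tss_ist1 through tss_ist6 above line up with the IST_DF, IST_NMI,
 * IST_MCE, IST_DBG, IST_NESTABLE and IST_DEFAULT slots handed out by
 * idt_vector_to_ist().
 */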

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Set up and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store the static pa of the gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Set up and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * Set the default kernel stack.
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Set up and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Set up and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors.  See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI, MCE, #DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

#ifndef __xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit,
 * so we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit before calling this (e.g., by disabling preemption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
#endif	/* !__xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via the
	 * syscall instruction.  The 32-bit syscalls are handled by the
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See the comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}
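
/*
 * A hedged usage note, inferred from the assertions above: these two
 * routines are expected to be invoked in pairs by the brand infrastructure
 * as branded threads come on and off CPU, with preemption disabled or at
 * high PIL so the rewritten IDT entries and MSRs stay consistent with the
 * current CPU.
 */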