/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t *gdt0;
#if !defined(__xpv)
desctbr_t gdt0_default_r;
#endif

gate_desc_t *idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t *ktss0;			/* kernel task state structure */

#if defined(__i386)
tss_t *dftss0;			/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t zero_udesc;		/* base zero user desc native procs */
user_desc_t null_udesc;		/* null user descriptor */
system_desc_t null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t ucs_on;
user_desc_t ucs_off;
user_desc_t ucs32_on;
user_desc_t ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void fast_null(void);
extern hrtime_t get_hrtime(void);
extern hrtime_t gethrvtime(void);
extern hrtime_t get_hrestime(void);
extern uint64_t getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

#if defined(__amd64)

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}

#elif defined(__i386)

/*
 * Install user segment descriptor for code and data.
 */
void
set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl, uint_t gran, uint_t defopsz)
{
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);

	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32 bit operands */
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */
}

#endif	/* __i386 */

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

#if defined(__amd64)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}

#elif defined(__i386)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);

	dp->ssd_type = type;
	dp->ssd_zero = 0;	/* must be zero */
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8);
	return ((void *)base);
}

#endif	/* __i386 */

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
 * all interrupts. We have different ISTs for each class of exceptions that are
 * most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * we do a read from KMDB that causes another #PF. Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(__xpv)

	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */

	CPU->cpu_gdt[sidx] = *udp;

#endif	/* __xpv */
}

/*
 * Writes single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(__xpv)

	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */

	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
	 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
	 * instruction to return from system calls back to 32-bit applications.
	 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
	 * descriptors. We therefore must ensure that the kernel uses something,
	 * though it will be ignored by hardware, that is compatible with 32-bit
	 * apps. For the same reason we must set the default op size of this
	 * descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) -1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char ivctname[80];
	void (*ivctptr)(void);
	int i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * syscall instruction. The 32-bit syscalls are handled by
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}