/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t     *gdt0;
#if !defined(__xpv)
desctbr_t       gdt0_default_r;
#endif

gate_desc_t     *idt0;          /* interrupt descriptor table */
#if defined(__i386)
desctbr_t       idt0_default_r;         /* describes idt0 in IDTR format */
#endif

tss_t           *ktss0;                 /* kernel task state structure */

#if defined(__i386)
tss_t           *dftss0;                /* #DF double-fault exception */
#endif  /* __i386 */

user_desc_t     zero_udesc;             /* base zero user desc native procs */
user_desc_t     null_udesc;             /* null user descriptor */
system_desc_t   null_sdesc;             /* null system descriptor */

#if defined(__amd64)
user_desc_t     zero_u32desc;           /* 32-bit compatibility procs */
#endif  /* __amd64 */

#if defined(__amd64)
user_desc_t     ucs_on;
user_desc_t     ucs_off;
user_desc_t     ucs32_on;
user_desc_t     ucs32_off;
#endif  /* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void     fast_null(void);
extern hrtime_t get_hrtime(void);
extern hrtime_t gethrvtime(void);
extern hrtime_t get_hrestime(void);
extern uint64_t getlgrp(void);

void (*(fasttable[]))(void) = {
        fast_null,                      /* T_FNULL routine */
        fast_null,                      /* T_FGETFP routine (initially null) */
        fast_null,                      /* T_FSETFP routine (initially null) */
        (void (*)())get_hrtime,         /* T_GETHRTIME */
        (void (*)())gethrvtime,         /* T_GETHRVTIME */
        (void (*)())get_hrestime,       /* T_GETHRESTIME */
        (void (*)())getlgrp             /* T_GETLGRP */
};
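
/*
 * Note that fasttable[] is indexed by the fast-trap subcode, so the order
 * of the entries above must match the T_FNULL ... T_GETLGRP definitions
 * in sys/trap.h.
 */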

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
        int ih_inum;
        gate_desc_t ih_interp_desc;
        gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

#if defined(__amd64)

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
        ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);

        /*
         * 64-bit long mode.
         */
        if (lmode == SDP_LONG)
                dp->usd_def32 = 0;           /* 32-bit operands only */
        else
                /*
                 * 32-bit compatibility mode.
                 */
                dp->usd_def32 = defopsz;     /* 0 = 16, 1 = 32-bit ops */

        dp->usd_long = lmode;        /* 64-bit mode */
        dp->usd_type = type;
        dp->usd_dpl = dpl;
        dp->usd_p = 1;
        dp->usd_gran = gran;         /* 0 = bytes, 1 = pages */

        dp->usd_lobase = (uintptr_t)base;
        dp->usd_midbase = (uintptr_t)base >> 16;
        dp->usd_hibase = (uintptr_t)base >> (16 + 8);
        dp->usd_lolimit = size;
        dp->usd_hilimit = (uintptr_t)size >> 16;
}
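
/*
 * For example (see init_gdt_common() below), the 64-bit kernel code segment
 * is built with:
 *
 *      set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
 *          SDP_PAGES, SDP_OP32);
 *
 * In long mode the hardware ignores the base, limit and default operand
 * size attributes of a code segment, so NULL and 0 suffice here, and
 * SDP_OP32 is overridden by the usd_def32 = 0 assignment above.
 */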

#elif defined(__i386)

/*
 * Install user segment descriptor for code and data.
 */
void
set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl, uint_t gran, uint_t defopsz)
{
        dp->usd_lolimit = size;
        dp->usd_hilimit = (uintptr_t)size >> 16;

        dp->usd_lobase = (uintptr_t)base;
        dp->usd_midbase = (uintptr_t)base >> 16;
        dp->usd_hibase = (uintptr_t)base >> (16 + 8);

        dp->usd_type = type;
        dp->usd_dpl = dpl;
        dp->usd_p = 1;
        dp->usd_def32 = defopsz;     /* 0 = 16, 1 = 32 bit operands */
        dp->usd_gran = gran;         /* 0 = bytes, 1 = pages */
}
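
/*
 * For example (see the 32-bit init_gdt_common() below), the flat kernel code
 * segment spanning the entire 32-bit address space is built with:
 *
 *      set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
 *          SDP_OP32);
 *
 * i.e. base 0 with a page-granular limit of 0xfffff (4GB) and 32-bit
 * operands.
 */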

#endif  /* __i386 */

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

#if defined(__amd64)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
        dp->ssd_lolimit = size;
        dp->ssd_hilimit = (uintptr_t)size >> 16;

        dp->ssd_lobase = (uintptr_t)base;
        dp->ssd_midbase = (uintptr_t)base >> 16;
        dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
        dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

        dp->ssd_type = type;
        dp->ssd_zero1 = 0;   /* must be zero */
        dp->ssd_zero2 = 0;
        dp->ssd_dpl = dpl;
        dp->ssd_p = 1;
        dp->ssd_gran = 0;    /* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
        uintptr_t       base;

        base = (uintptr_t)dp->ssd_lobase |
            (uintptr_t)dp->ssd_midbase << 16 |
            (uintptr_t)dp->ssd_hibase << (16 + 8) |
            (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
        return ((void *)base);
}
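
/*
 * get_ssd_base() simply reassembles the 64-bit base address from the four
 * fields that set_syssegd() split it across (ssd_lobase, ssd_midbase,
 * ssd_hibase and ssd_hi64base).
 */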

#elif defined(__i386)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
        dp->ssd_lolimit = size;
        dp->ssd_hilimit = (uintptr_t)size >> 16;

        dp->ssd_lobase = (uintptr_t)base;
        dp->ssd_midbase = (uintptr_t)base >> 16;
        dp->ssd_hibase = (uintptr_t)base >> (16 + 8);

        dp->ssd_type = type;
        dp->ssd_zero = 0;    /* must be zero */
        dp->ssd_dpl = dpl;
        dp->ssd_p = 1;
        dp->ssd_gran = 0;    /* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
        uintptr_t       base;

        base = (uintptr_t)dp->ssd_lobase |
            (uintptr_t)dp->ssd_midbase << 16 |
            (uintptr_t)dp->ssd_hibase << (16 + 8);
        return ((void *)base);
}

#endif  /* __i386 */

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts.  We have different ISTs for each class of exceptions
 * that are most likely to occur while handling an existing exception; while
 * many of these are just going to panic, it's nice not to trample on the
 * existing exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT uses the DBG IST for *all* vectors: consider single-stepping
 * tr_pftrap, when a read from KMDB causes another #PF.  Without its own IST,
 * this would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
        _NOTE(ARGUNUSED(vector));
        return (IST_NONE);
#else
        switch (vector) {
        /* These should always use IST even without KPTI enabled. */
        case T_DBLFLT:
                return (IST_DF);
        case T_NMIFLT:
                return (IST_NMI);
        case T_MCE:
                return (IST_MCE);

        case T_BPTFLT:
        case T_SGLSTP:
                if (kpti_enable == 1) {
                        return (IST_DBG);
                }
                return (IST_NONE);
        case T_STKFLT:
        case T_GPFLT:
        case T_PGFLT:
                if (kpti_enable == 1) {
                        return (IST_NESTABLE);
                }
                return (IST_NONE);
        default:
                if (kpti_enable == 1) {
                        return (IST_DEFAULT);
                }
                return (IST_NONE);
        }
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
        dp->sgd_looffset = (uintptr_t)func;
        dp->sgd_hioffset = (uintptr_t)func >> 16;
        dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
        dp->sgd_selector = (uint16_t)sel;
        dp->sgd_ist = ist;
        dp->sgd_type = type;
        dp->sgd_dpl = dpl;
        dp->sgd_p = 1;
}
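
/*
 * For example (see init_idt_common() below), the divide-by-zero vector is
 * installed as an interrupt gate with:
 *
 *      set_gatesegd(&idt[T_ZERODIV],
 *          (kpti_enable == 1) ? &tr_div0trap : &div0trap,
 *          KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
 */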

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(__xpv)

        uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

        if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
                panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else   /* __xpv */

        CPU->cpu_gdt[sidx] = *udp;

#endif  /* __xpv */
}

/*
 * Writes the single descriptor pointed to by udp into the process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(__xpv)

        uint64_t dpa;

        dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
            ((uintptr_t)ldp & PAGEOFFSET);

        /*
         * The hypervisor is a little more restrictive about what it
         * supports in the LDT.
         */
        if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
                return (EINVAL);

#else   /* __xpv */

        *ldp = *udp;

#endif  /* __xpv */
        return (0);
}

#if defined(__xpv)

/*
 * Converts a hw format gate descriptor into the pseudo-IDT format expected
 * by the hypervisor. Returns 1 if a valid entry was written, 0 if the IDT
 * slot was empty.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
        trap_info_t *ti = ti_arg;       /* XXPV Aargh - segments.h comment */

        /*
         * skip holes in the IDT
         */
        if (GATESEG_GETOFFSET(sgd) == 0)
                return (0);

        ASSERT(sgd->sgd_type == SDT_SYSIGT);
        ti->vector = vec;
        TI_SET_DPL(ti, sgd->sgd_dpl);

        /*
         * Is this an interrupt gate?
         */
        if (sgd->sgd_type == SDT_SYSIGT) {
                /* LINTED */
                TI_SET_IF(ti, 1);
        }
        ti->cs = sgd->sgd_selector;
#if defined(__amd64)
        ti->cs |= SEL_KPL;   /* force into ring 3. see KCS_SEL  */
#endif
        ti->address = GATESEG_GETOFFSET(sgd);
        return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
        trap_info_t trapinfo[2];

        bzero(trapinfo, sizeof (trapinfo));
        if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
                return;
        if (xen_set_trap_table(trapinfo) != 0)
                panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif  /* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
        int i;

        /*
         * 64-bit kernel code segment.
         */
        set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
            SDP_PAGES, SDP_OP32);

        /*
         * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
         * mode, but we set it here to 0xFFFF so that we can use the SYSRET
         * instruction to return from system calls back to 32-bit applications.
         * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
         * descriptors. We therefore must ensure that the kernel uses something,
         * though it will be ignored by hardware, that is compatible with 32-bit
         * apps. For the same reason we must set the default op size of this
         * descriptor to 32-bit operands.
         */
        set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
            SEL_KPL, SDP_PAGES, SDP_OP32);
        gdt[GDT_KDATA].usd_def32 = 1;

        /*
         * 64-bit user code segment.
         */
        set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
            SDP_PAGES, SDP_OP32);

        /*
         * 32-bit user code segment.
         */
        set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
            SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * See gdt_ucode32() and gdt_ucode_native().
         */
        ucs_on = ucs_off = gdt[GDT_UCODE];
        ucs_off.usd_p = 0;      /* forces #np fault */

        ucs32_on = ucs32_off = gdt[GDT_U32CODE];
        ucs32_off.usd_p = 0;    /* forces #np fault */

        /*
         * 32 and 64 bit data segments can actually share the same descriptor.
         * In long mode only the present bit is checked but all other fields
         * are loaded. But in compatibility mode all fields are interpreted
         * as in legacy mode so they must be set correctly for a 32-bit data
         * segment.
         */
        set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
            SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

        /*
         * The 64-bit kernel has no default LDT. By default, the LDT descriptor
         * in the GDT is 0.
         */

        /*
         * Kernel TSS
         */
        set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
            sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif  /* !__xpv */

        /*
         * Initialize fs and gs descriptors for 32 bit processes.
         * Only attributes and limits are initialized, the effective
         * base address is programmed via fsbase/gsbase.
         */
        set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);
        set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * Initialize the descriptors set aside for brand usage.
         * Only attributes and limits are initialized.
         */
        for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
                set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
                    SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * Initialize convenient zero base user descriptors for clearing
         * lwp private %fs and %gs descriptors in GDT. See setregs() for
         * an example.
         */
        set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
            SDP_BYTES, SDP_OP32);
        set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
            SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
        uint64_t gdtpa;
        ulong_t ma[1];          /* XXPV should be a memory_t */
        ulong_t addr;

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);

        /*
         * XXX Since we never invoke kmdb until after the kernel takes
         * over the descriptor tables why not have it use the kernel's
         * selectors?
         */
        if (boothowto & RB_DEBUG) {
                set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
                    SEL_KPL, SDP_PAGES, SDP_OP32);
                set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
                    SEL_KPL, SDP_PAGES, SDP_OP32);
        }

        /*
         * Clear write permission for page containing the gdt and install it.
         */
        gdtpa = pfn_to_pa(va_to_pfn(gdt0));
        ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
        kbm_read_only((uintptr_t)gdt0, gdtpa);
        xen_set_gdt(ma, NGDT);

        /*
         * Reload the segment registers to use the new GDT.
         * On 64-bit, fixup KCS_SEL to be in ring 3.
         * See KCS_SEL in segments.h.
         */
        load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

        /*
         *  setup %gs for kernel
         */
        xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

        /*
         * XX64 We should never dereference off "other gsbase" or
         * "fsbase".  So, we should arrange to point FSBASE and
         * KGSBASE somewhere truly awful e.g. point it at the last
         * valid address below the hole so that any attempts to index
         * off them cause an exception.
         *
         * For now, point it at 8G -- at least it should be unmapped
         * until some 64-bit processes run.
         */
        addr = 0x200000000ul;
        xen_set_segment_base(SEGBASE_FS, addr);
        xen_set_segment_base(SEGBASE_GS_USER, addr);
        xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

        return (gdt0);
}

#else   /* __xpv */

static user_desc_t *
init_gdt(void)
{
        desctbr_t       r_bgdt, r_gdt;
        user_desc_t     *bgdt;

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);

        /*
         * Copy in from boot's gdt to our gdt.
         * Entry 0 is the null descriptor by definition.
         */
        rd_gdtr(&r_bgdt);
        bgdt = (user_desc_t *)r_bgdt.dtr_base;
        if (bgdt == NULL)
                panic("null boot gdt");

        gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
        gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
        gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
        gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
        gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

        /*
         * Install our new GDT
         */
        r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
        r_gdt.dtr_base = (uintptr_t)gdt0;
        wr_gdtr(&r_gdt);

        /*
         * Reload the segment registers to use the new GDT
         */
        load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

        /*
         *  setup %gs for kernel
         */
        wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

        /*
         * XX64 We should never dereference off "other gsbase" or
         * "fsbase".  So, we should arrange to point FSBASE and
         * KGSBASE somewhere truly awful e.g. point it at the last
         * valid address below the hole so that any attempts to index
         * off them cause an exception.
         *
         * For now, point it at 8G -- at least it should be unmapped
         * until some 64-bit processes run.
         */
        wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
        wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
        return (gdt0);
}

#endif  /* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
        int i;

        /*
         * Text and data for both kernel and user span entire 32 bit
         * address space.
         */

        /*
         * kernel code segment.
         */
        set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
            SDP_OP32);

        /*
         * kernel data segment.
         */
        set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
            SDP_OP32);

        /*
         * user code segment.
         */
        set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
            SDP_OP32);

        /*
         * user data segment.
         */
        set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
            SDP_OP32);

#if !defined(__xpv)

        /*
         * TSS for T_DBLFLT (double fault) handler
         */
        set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
            sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

        /*
         * TSS for kernel
         */
        set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
            sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif  /* !__xpv */

        /*
         * %gs selector for kernel
         */
        set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
            SEL_KPL, SDP_BYTES, SDP_OP32);

        /*
         * Initialize lwp private descriptors.
         * Only attributes and limits are initialized, the effective
         * base address is programmed via fsbase/gsbase.
         */
        set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
            SDP_PAGES, SDP_OP32);
        set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
            SDP_PAGES, SDP_OP32);

        /*
         * Initialize the descriptors set aside for brand usage.
         * Only attributes and limits are initialized.
         */
        for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
                set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
                    SDP_PAGES, SDP_OP32);
        /*
         * Initialize convenient zero base user descriptor for clearing
         * lwp private %fs and %gs descriptors in GDT. See setregs() for
         * an example.
         */
        set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
            SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
        uint64_t gdtpa;
        ulong_t ma[1];          /* XXPV should be a memory_t */

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);
        gdtpa = pfn_to_pa(va_to_pfn(gdt0));

        /*
         * XXX Since we never invoke kmdb until after the kernel takes
         * over the descriptor tables why not have it use the kernel's
         * selectors?
         */
        if (boothowto & RB_DEBUG) {
                set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
                    SDP_PAGES, SDP_OP32);
                set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
                    SDP_PAGES, SDP_OP32);
        }

        /*
         * Clear write permission for page containing the gdt and install it.
         */
        ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
        kbm_read_only((uintptr_t)gdt0, gdtpa);
        xen_set_gdt(ma, NGDT);

        /*
         * Reload the segment registers to use the new GDT
         */
        load_segment_registers(
            KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

        return (gdt0);
}

#else   /* __xpv */

static user_desc_t *
init_gdt(void)
{
        desctbr_t       r_bgdt, r_gdt;
        user_desc_t     *bgdt;

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        /*
         * XXX this allocation belongs in our caller, not here.
         */
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);

        /*
         * Copy in from boot's gdt to our gdt entries.
         * Entry 0 is the null descriptor by definition.
         */
        rd_gdtr(&r_bgdt);
        bgdt = (user_desc_t *)r_bgdt.dtr_base;
        if (bgdt == NULL)
                panic("null boot gdt");

        gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
        gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
        gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
        gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

        /*
         * Install our new GDT
         */
        r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
        r_gdt.dtr_base = (uintptr_t)gdt0;
        wr_gdtr(&r_gdt);

        /*
         * Reload the segment registers to use the new GDT
         */
        load_segment_registers(
            KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

        return (gdt0);
}

#endif  /* __xpv */
#endif  /* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
        set_gatesegd(&idt[T_ZERODIV],
            (kpti_enable == 1) ? &tr_div0trap : &div0trap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
        set_gatesegd(&idt[T_SGLSTP],
            (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
        set_gatesegd(&idt[T_NMIFLT],
            (kpti_enable == 1) ? &tr_nmiint : &nmiint,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
        set_gatesegd(&idt[T_BPTFLT],
            (kpti_enable == 1) ? &tr_brktrap : &brktrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
        set_gatesegd(&idt[T_OVFLW],
            (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
        set_gatesegd(&idt[T_BOUNDFLT],
            (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
        set_gatesegd(&idt[T_ILLINST],
            (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
        set_gatesegd(&idt[T_NOEXTFLT],
            (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

        /*
         * double fault handler.
         *
         * Note that on the hypervisor a guest does not receive #df faults.
         * Instead a failsafe event is injected into the guest if its selectors
         * and/or stack is in a broken state. See xen_failsafe_callback.
         */
#if !defined(__xpv)
        set_gatesegd(&idt[T_DBLFLT],
            (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif  /* !__xpv */

        /*
         * T_EXTOVRFLT coprocessor-segment-overrun not supported.
         */
        set_gatesegd(&idt[T_TSSFLT],
            (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
        set_gatesegd(&idt[T_SEGFLT],
            (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
        set_gatesegd(&idt[T_STKFLT],
            (kpti_enable == 1) ? &tr_stktrap : &stktrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
        set_gatesegd(&idt[T_GPFLT],
            (kpti_enable == 1) ? &tr_gptrap : &gptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
        set_gatesegd(&idt[T_PGFLT],
            (kpti_enable == 1) ? &tr_pftrap : &pftrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
        set_gatesegd(&idt[T_EXTERRFLT],
            (kpti_enable == 1) ? &tr_ndperr : &ndperr,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
        set_gatesegd(&idt[T_ALIGNMENT],
            (kpti_enable == 1) ? &tr_achktrap : &achktrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
        set_gatesegd(&idt[T_MCE],
            (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
        set_gatesegd(&idt[T_SIMDFPE],
            (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

        /*
         * install fast trap handler at 210.
         */
        set_gatesegd(&idt[T_FASTTRAP],
            (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

        /*
         * System call handler.
         */
        set_gatesegd(&idt[T_SYSCALLINT],
            (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

        /*
         * Install the DTrace interrupt handler for the pid provider.
         */
        set_gatesegd(&idt[T_DTRACE_RET],
            (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

        /*
         * Prepare interposing descriptor for the syscall handler
         * and cache copy of the default descriptor.
         */
        brand_tbl[0].ih_inum = T_SYSCALLINT;
        brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

        set_gatesegd(&(brand_tbl[0].ih_interp_desc),
            (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
            &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
            idt_vector_to_ist(T_SYSCALLINT));

        brand_tbl[1].ih_inum = 0;
}
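
/*
 * brand_tbl[1].ih_inum is deliberately left as 0 above; it is the NULL
 * signpost that terminates the loops in brand_interpositioning_enable()
 * and brand_interpositioning_disable() below.
 */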

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
        init_idt_common(idt);
}

#else   /* __xpv */

static void
init_idt(gate_desc_t *idt)
{
        char    ivctname[80];
        void    (*ivctptr)(void);
        int     i;

        /*
         * Initialize entire table with 'reserved' trap and then overwrite
         * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
         * since it can only be generated on a 386 processor. 15 is also
         * unsupported and reserved.
         */
#if !defined(__xpv)
        for (i = 0; i < NIDT; i++) {
                set_gatesegd(&idt[i],
                    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(T_RESVTRAP));
        }
#else
        for (i = 0; i < NIDT; i++) {
                set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    IST_NONE);
        }
#endif

        /*
         * 20-31 reserved
         */
#if !defined(__xpv)
        for (i = 20; i < 32; i++) {
                set_gatesegd(&idt[i],
                    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(T_INVALTRAP));
        }
#else
        for (i = 20; i < 32; i++) {
                set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    IST_NONE);
        }
#endif

        /*
         * interrupts 32 - 255
         */
        for (i = 32; i < 256; i++) {
#if !defined(__xpv)
                (void) snprintf(ivctname, sizeof (ivctname),
                    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
                (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
                ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
                if (ivctptr == NULL)
                        panic("kobj_getsymvalue(%s) failed", ivctname);

                set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(i));
        }

        /*
         * Now install the common ones. Note that it will overlay some
         * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
         */
        init_idt_common(idt);
}

#endif  /* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
        xen_set_ldt(NULL, 0);
#else
        wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
        extern struct cpu cpus[];

        /*
         * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
         * context switch but it'll be overwritten with this same value anyway.
         */
        if (kpti_enable == 1) {
                ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
        }

        /* Set up the IST stacks for double fault, NMI, MCE. */
        ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
        ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
        ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

        /*
         * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
         * enabled), and also for KDI (always).
         */
        ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

        if (kpti_enable == 1) {
                /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
                ktss0->tss_ist5 =
                    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

                /* This IST stack is used for all other intrs (for KPTI). */
                ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
        }

        /*
         * Set I/O bit map offset equal to size of TSS segment limit
         * for no I/O permission map. This will force all user I/O
         * instructions to generate #gp fault.
         */
        ktss0->tss_bitmapbase = sizeof (*ktss0);

        /*
         * Point %tr to descriptor for ktss0 in gdt.
         */
        wr_tsr(KTSS_SEL);
}
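
/*
 * Note the correspondence between the stacks programmed above and the values
 * returned by idt_vector_to_ist(): tss_ist1 through tss_ist6 back IST_DF,
 * IST_NMI, IST_MCE, IST_DBG, IST_NESTABLE and IST_DEFAULT respectively
 * (this assumes the IST_* constants in segments.h number the ISTs 1 through
 * 6 in that order).
 */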

#endif  /* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
        uint_t vec;
        user_desc_t *gdt;

        /*
         * Setup and install our GDT.
         */
        gdt = init_gdt();

        /*
         * Store static pa of gdt to speed up pa_to_ma() translations
         * on lwp context switches.
         */
        ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
        CPU->cpu_gdt = gdt;
        CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

        /*
         * Setup and install our IDT.
         */
#if !defined(__lint)
        ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
        idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(idt0, PAGESIZE);
        init_idt(idt0);
        for (vec = 0; vec < NIDT; vec++)
                xen_idt_write(&idt0[vec], vec);

        CPU->cpu_idt = idt0;

        /*
         * set default kernel stack
         */
        xen_stack_switch(KDS_SEL,
            (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

        xen_init_callbacks();

        init_ldt();
}

#else   /* __xpv */

void
init_desctbls(void)
{
        user_desc_t *gdt;
        desctbr_t idtr;

        /*
         * Allocate IDT and TSS structures on unique pages for better
         * performance in virtual machines.
         */
#if !defined(__lint)
        ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
        idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(idt0, PAGESIZE);
#if !defined(__lint)
        ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
        ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
            PAGESIZE, PAGESIZE);
        bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
        ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
        dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
            PAGESIZE, PAGESIZE);
        bzero(dftss0, PAGESIZE);
#endif

        /*
         * Setup and install our GDT.
         */
        gdt = init_gdt();
        ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
        CPU->cpu_gdt = gdt;

        /*
         * Initialize this CPU's LDT.
         */
        CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
            LDT_CPU_SIZE, PAGESIZE);
        bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
        CPU->cpu_m.mcpu_ldt_len = 0;

        /*
         * Setup and install our IDT.
         */
        init_idt(idt0);

        idtr.dtr_base = (uintptr_t)idt0;
        idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
        wr_idtr(&idtr);
        CPU->cpu_idt = idt0;

#if defined(__i386)
        /*
         * We maintain a description of idt0 in convenient IDTR format
         * for #pf's on some older pentium processors. See pentium_pftrap().
         */
        idt0_default_r = idtr;
#endif  /* __i386 */

        init_tss();
        CPU->cpu_tss = ktss0;
        init_ldt();

        /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
        kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif  /* __xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
        set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
            SDP_PAGES, SDP_OP32);
        set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
            SDP_PAGES, SDP_OP32);
#elif defined(__i386)
        set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
            SDP_PAGES, SDP_OP32);
        set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
            SDP_PAGES, SDP_OP32);
#endif  /* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
        gate_desc_t     *idt = CPU->cpu_idt;
        int             i;

        ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

        for (i = 0; brand_tbl[i].ih_inum; i++) {
                idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
                xen_idt_write(&idt[brand_tbl[i].ih_inum],
                    brand_tbl[i].ih_inum);
#endif
        }

#if defined(__amd64)
#if defined(__xpv)

        /*
         * Currently the hypervisor only supports 64-bit syscalls via the
         * syscall instruction. The 32-bit syscalls are handled by the
         * interrupt gate above.
         */
        xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
            CALLBACKF_mask_events);

#else

        if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
                } else {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
                }
        }

#endif
#endif  /* __amd64 */

        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_INTC_SEP_EIP,
                            (uintptr_t)tr_brand_sys_sysenter);
                } else {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
                }
        }
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
        gate_desc_t     *idt = CPU->cpu_idt;
        int i;

        ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

        for (i = 0; brand_tbl[i].ih_inum; i++) {
                idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
                xen_idt_write(&idt[brand_tbl[i].ih_inum],
                    brand_tbl[i].ih_inum);
#endif
        }

#if defined(__amd64)
#if defined(__xpv)

        /*
         * See comment above in brand_interpositioning_enable.
         */
        xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
            CALLBACKF_mask_events);

#else

        if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
                } else {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
                }
        }

#endif
#endif  /* __amd64 */

        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
                } else {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
                }
        }
}