9723 provide support for VMM's GDT handling
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
/*
 * cpu0 and default tables and structures.
 */
user_desc_t *gdt0;
#if !defined(__xpv)
desctbr_t gdt0_default_r;
#endif

gate_desc_t *idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t *ktss0;			/* kernel task state structure */

#if defined(__i386)
tss_t *dftss0;			/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t zero_udesc;		/* base zero user desc native procs */
user_desc_t null_udesc;		/* null user descriptor */
system_desc_t null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t ucs_on;
user_desc_t ucs_off;
user_desc_t ucs32_on;
user_desc_t ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void fast_null(void);
extern hrtime_t get_hrtime(void);
extern hrtime_t gethrvtime(void);
extern hrtime_t get_hrestime(void);
extern uint64_t getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
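
/*
 * Editorial note: fast traps are entered through the T_FASTTRAP gate
 * installed in init_idt_common() below; the fasttrap entry point uses the
 * fast-trap number (passed in %eax by convention) to index this table, so
 * the order of entries here must match the T_F* numbering in <sys/trap.h>.
 */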

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor privilege and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
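
/*
 * Example (editorial, illustrative only): init_gdt_common() below builds
 * the 64-bit user code segment with
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *
 * In long mode the base and limit of a code segment are ignored by
 * hardware, so only the type, DPL, present and long bits are significant.
 */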

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
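
/*
 * Editorial note: get_ssd_base() is simply the inverse of the base-address
 * packing performed by set_syssegd() above, reassembling the 64-bit base
 * from the ssd_lobase, ssd_midbase, ssd_hibase and ssd_hi64base fields.
 */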

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts. We have different ISTs for each class of exceptions that
 * are most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * we do a read from KMDB that causes another #PF. Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
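
/*
 * Example (editorial, illustrative only): init_idt_common() below installs
 * the page fault gate roughly as
 *
 *	set_gatesegd(&idt[T_PGFLT], &tr_pftrap, KCS_SEL, SDT_SYSIGT,
 *	    TRP_KPL, idt_vector_to_ist(T_PGFLT));
 *
 * with the 64-bit handler offset split across the three sgd_*offset fields.
 */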

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}
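
/*
 * Usage sketch (editorial, illustrative only; "udesc" is hypothetical):
 * a caller updating an lwp's %fs descriptor would do something like
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &udesc);
 *	kpreempt_enable();
 *
 * so that the thread cannot migrate while the per-CPU GDT is modified.
 */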

/*
 * Writes the single descriptor pointed to by udp into the process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}
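
/*
 * Editorial note: this is the path by which user-supplied descriptors
 * (e.g. via sysi86(SI86DSCR)) reach a process's LDT; on the hypervisor the
 * write must go through HYPERVISOR_update_descriptor() because the LDT
 * pages are mapped read-only.
 */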

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the
 * hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications.  SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors. We therefore must ensure that the kernel
	 * uses something, though it will be ignored by hardware, that is
	 * compatible with 32-bit apps. For the same reason we must set the
	 * default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32- and 64-bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
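
/*
 * Editorial note: the relative placement of the GDT_U32CODE, GDT_UDATA and
 * GDT_UCODE entries above is constrained by SYSCALL/SYSRET, which derive
 * the user %cs and %ss selectors from fixed offsets off the MSR_AMD_STAR
 * value; these descriptors cannot be rearranged independently.
 */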

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char ivctname[80];
	void (*ivctptr)(void);
	int i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}
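
/*
 * Editorial note: the tss_ist1..tss_ist6 slots filled in above are selected
 * by the IST_* indices returned from idt_vector_to_ist(); e.g. IST_DF
 * appears to correspond to tss_ist1, IST_NMI to tss_ist2, and so on.
 */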

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

#ifndef __xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64KB on a VM exit, so
 * we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit, before calling this (such as by disabling preemption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
#endif	/* !__xpv */
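
/*
 * Usage sketch (editorial, illustrative only): a VMX host such as the vmm
 * work this change supports would call this after a VM exit, with migration
 * already prevented, e.g.
 *
 *	kpreempt_disable();
 *	...
 *	reset_gdtr_limit();
 *	kpreempt_enable();
 */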

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * syscall instruction. The 32-bit syscalls are handled by
	 * the interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}