9723 provide support for VMM's GDT handling
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
/*
 * cpu0 and default tables and structures.
 */
user_desc_t *gdt0;
#if !defined(__xpv)
desctbr_t gdt0_default_r;
#endif

gate_desc_t *idt0;		/* interrupt descriptor table */
#if defined(__i386)
desctbr_t idt0_default_r;	/* describes idt0 in IDTR format */
#endif

tss_t *ktss0;			/* kernel task state structure */

#if defined(__i386)
tss_t *dftss0;			/* #DF double-fault exception */
#endif	/* __i386 */

user_desc_t zero_udesc;		/* base zero user desc native procs */
user_desc_t null_udesc;		/* null user descriptor */
system_desc_t null_sdesc;	/* null system descriptor */

#if defined(__amd64)
user_desc_t zero_u32desc;	/* 32-bit compatibility procs */
#endif	/* __amd64 */

#if defined(__amd64)
user_desc_t ucs_on;
user_desc_t ucs_off;
user_desc_t ucs32_on;
user_desc_t ucs32_off;
#endif	/* __amd64 */

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void fast_null(void);
extern hrtime_t get_hrtime(void);
extern hrtime_t gethrvtime(void);
extern hrtime_t get_hrestime(void);
extern uint64_t getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
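
/*
 * Editorial note: fast traps are entered through the T_FASTTRAP gate
 * installed in init_idt_common() below; the fasttrap entry point uses the
 * fast-trap number (passed in %eax by convention) to index this table, so
 * the order of entries here must match the T_F* numbering in <sys/trap.h>.
 */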

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor privilege and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
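
/*
 * Example (editorial, illustrative only): init_gdt_common() below builds
 * the 64-bit user code segment with
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *
 * In long mode the base and limit of a code segment are ignored by
 * hardware, so only the type, DPL, present and long bits are significant.
 */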

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
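
/*
 * Editorial note: get_ssd_base() is simply the inverse of the base-address
 * packing performed by set_syssegd() above, reassembling the 64-bit base
 * from the ssd_lobase, ssd_midbase, ssd_hibase and ssd_hi64base fields.
 */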

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts. We have different ISTs for each class of exceptions that
 * are most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * we do a read from KMDB that causes another #PF. Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
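
/*
 * Example (editorial, illustrative only): init_idt_common() below installs
 * the page fault gate roughly as
 *
 *	set_gatesegd(&idt[T_PGFLT], &tr_pftrap, KCS_SEL, SDT_SYSIGT,
 *	    TRP_KPL, idt_vector_to_ist(T_PGFLT));
 *
 * with the 64-bit handler offset split across the three sgd_*offset fields.
 */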

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}
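
/*
 * Usage sketch (editorial, illustrative only; "udesc" is hypothetical):
 * a caller updating an lwp's %fs descriptor would do something like
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &udesc);
 *	kpreempt_enable();
 *
 * so that the thread cannot migrate while the per-CPU GDT is modified.
 */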

/*
 * Writes the single descriptor pointed to by udp into the process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}
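
/*
 * Editorial note: this is the path by which user-supplied descriptors
 * (e.g. via sysi86(SI86DSCR)) reach a process's LDT; on the hypervisor the
 * write must go through HYPERVISOR_update_descriptor() because the LDT
 * pages are mapped read-only.
 */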

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the
 * hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
#if defined(__amd64)
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL  */
#endif
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */

#if defined(__amd64)

/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications.  SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors. We therefore must ensure that the kernel
	 * uses something, though it will be ignored by hardware, that is
	 * compatible with 32-bit apps. For the same reason we must set the
	 * default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32- and 64-bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
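
/*
 * Editorial note: the relative placement of the GDT_U32CODE, GDT_UDATA and
 * GDT_UCODE entries above is constrained by SYSCALL/SYSRET, which derive
 * the user %cs and %ss selectors from fixed offsets off the MSR_AMD_STAR
 * value; these descriptors cannot be rearranged independently.
 */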

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */

#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

#if !defined(__xpv)

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);
	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t r_bgdt, r_gdt;
	user_desc_t *bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */

/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char ivctname[80];
	void (*ivctptr)(void);
	int i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
#if !defined(__xpv)
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}
#else
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * 20-31 reserved
	 */
#if !defined(__xpv)
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}
#else
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    IST_NONE);
	}
#endif

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
#if !defined(__xpv)
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
#else
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
#endif
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/* Set up the IST stacks for double fault, NMI, MCE. */
	ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}
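
/*
 * Editorial note: the tss_ist1..tss_ist6 slots filled in above are selected
 * by the IST_* indices returned from idt_vector_to_ist(); e.g. IST_DF
 * appears to correspond to tss_ist1, IST_NMI to tss_ist2, and so on.
 */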

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
#if !defined(__lint)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
#endif
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

#ifndef __xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64KB on a VM exit, so
 * we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit, before calling this (such as by disabling preemption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
#endif	/* !__xpv */
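
/*
 * Usage sketch (editorial, illustrative only): a VMX host such as the vmm
 * work this change supports would call this after a VM exit, with migration
 * already prevented, e.g.
 *
 *	kpreempt_disable();
 *	...
 *	reset_gdtr_limit();
 *	kpreempt_enable();
 */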

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * syscall instruction. The 32-bit syscalls are handled by
	 * the interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t *idt = CPU->cpu_idt;
	int i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}