1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2018 Joyent, Inc. All rights reserved.
28 */
29
30 /*
31 * Copyright (c) 1992 Terrence R. Lambert.
32 * Copyright (c) 1990 The Regents of the University of California.
33 * All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * William Jolitz.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
67 */
68
69 #include <sys/types.h>
70 #include <sys/sysmacros.h>
71 #include <sys/tss.h>
72 #include <sys/segments.h>
73 #include <sys/trap.h>
74 #include <sys/cpuvar.h>
75 #include <sys/bootconf.h>
76 #include <sys/x86_archext.h>
77 #include <sys/controlregs.h>
78 #include <sys/archsystm.h>
79 #include <sys/machsystm.h>
80 #include <sys/kobj.h>
81 #include <sys/cmn_err.h>
82 #include <sys/reboot.h>
83 #include <sys/kdi.h>
84 #include <sys/mach_mmu.h>
85 #include <sys/systm.h>
86 #include <sys/note.h>
87
88 #ifdef __xpv
89 #include <sys/hypervisor.h>
90 #include <vm/as.h>
91 #endif
92
93 #include <sys/promif.h>
94 #include <sys/bootinfo.h>
95 #include <vm/kboot_mmu.h>
96 #include <vm/hat_pte.h>
97
98 /*
99 * cpu0 and default tables and structures.
100 */
101 user_desc_t *gdt0;
102 #if !defined(__xpv)
103 desctbr_t gdt0_default_r;
104 #endif
105
106 gate_desc_t *idt0; /* interrupt descriptor table */
107 #if defined(__i386)
108 desctbr_t idt0_default_r; /* describes idt0 in IDTR format */
109 #endif
110
111 tss_t *ktss0; /* kernel task state structure */
112
113 #if defined(__i386)
114 tss_t *dftss0; /* #DF double-fault exception */
115 #endif /* __i386 */
116
117 user_desc_t zero_udesc; /* base zero user desc native procs */
118 user_desc_t null_udesc; /* null user descriptor */
119 system_desc_t null_sdesc; /* null system descriptor */
120
121 #if defined(__amd64)
122 user_desc_t zero_u32desc; /* 32-bit compatibility procs */
123 #endif /* __amd64 */
124
125 #if defined(__amd64)
126 user_desc_t ucs_on;
127 user_desc_t ucs_off;
128 user_desc_t ucs32_on;
129 user_desc_t ucs32_off;
130 #endif /* __amd64 */
131
132 /*
133 * If the size of this is changed, you must update hat_pcp_setup() and the
134 * definitions in exception.s
135 */
136 extern char dblfault_stack0[DEFAULTSTKSZ];
137 extern char nmi_stack0[DEFAULTSTKSZ];
138 extern char mce_stack0[DEFAULTSTKSZ];
139
140 extern void fast_null(void);
141 extern hrtime_t get_hrtime(void);
142 extern hrtime_t gethrvtime(void);
143 extern hrtime_t get_hrestime(void);
144 extern uint64_t getlgrp(void);
145
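/*
 * Dispatch table for the fast traps reached via the T_FASTTRAP gate
 * (vector 210); entries are indexed by the T_F* fast trap number.
 */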
146 void (*(fasttable[]))(void) = {
147 fast_null, /* T_FNULL routine */
148 fast_null, /* T_FGETFP routine (initially null) */
149 fast_null, /* T_FSETFP routine (initially null) */
150 (void (*)())get_hrtime, /* T_GETHRTIME */
151 (void (*)())gethrvtime, /* T_GETHRVTIME */
152 (void (*)())get_hrestime, /* T_GETHRESTIME */
153 (void (*)())getlgrp /* T_GETLGRP */
154 };
155
156 /*
157 * Structure containing pre-computed descriptors to allow us to temporarily
158 * interpose on a standard handler.
159 */
160 struct interposing_handler {
161 int ih_inum;
162 gate_desc_t ih_interp_desc;
163 gate_desc_t ih_default_desc;
164 };
165
166 /*
167 * The brand infrastructure interposes on two handlers, and we use one as a
168 * NULL signpost.
169 */
170 static struct interposing_handler brand_tbl[2];
171
172 /*
173 * software prototypes for default local descriptor table
174 */
175
176 /*
177 * Routines for loading segment descriptors in format the hardware
178 * can understand.
179 */
180
181 #if defined(__amd64)
182
183 /*
184 * In long mode we have the new L or long mode attribute bit
185 * for code segments. Only the conforming bit in type is used along
186 * with the descriptor privilege level (DPL) and present bits. Default
187 * operand size must be zero when in long mode. In 32-bit compatibility
188 * mode all fields are treated as in legacy mode. For data segments while
189 * in long mode only the present bit is loaded.
190 */
191 void
192 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
193 uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
194 {
195 ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
196
197 /*
198 * 64-bit long mode.
199 */
200 if (lmode == SDP_LONG)
201 dp->usd_def32 = 0; /* 32-bit operands only */
202 else
203 /*
204 * 32-bit compatibility mode.
205 */
206 dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32-bit ops */
207
208 dp->usd_long = lmode; /* 64-bit mode */
209 dp->usd_type = type;
210 dp->usd_dpl = dpl;
211 dp->usd_p = 1;
212 dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
213
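/*
 * The 32-bit base is split across three descriptor fields and the
 * 20-bit limit across two, matching the legacy descriptor layout.
 */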
214 dp->usd_lobase = (uintptr_t)base;
215 dp->usd_midbase = (uintptr_t)base >> 16;
216 dp->usd_hibase = (uintptr_t)base >> (16 + 8);
217 dp->usd_lolimit = size;
218 dp->usd_hilimit = (uintptr_t)size >> 16;
219 }
220
221 #elif defined(__i386)
222
223 /*
224 * Install user segment descriptor for code and data.
225 */
226 void
227 set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
228 uint_t dpl, uint_t gran, uint_t defopsz)
229 {
230 dp->usd_lolimit = size;
231 dp->usd_hilimit = (uintptr_t)size >> 16;
232
233 dp->usd_lobase = (uintptr_t)base;
234 dp->usd_midbase = (uintptr_t)base >> 16;
235 dp->usd_hibase = (uintptr_t)base >> (16 + 8);
236
237 dp->usd_type = type;
238 dp->usd_dpl = dpl;
239 dp->usd_p = 1;
240 dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32 bit operands */
241 dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
242 }
243
244 #endif /* __i386 */
245
246 /*
247 * Install system segment descriptor for LDT and TSS segments.
248 */
249
250 #if defined(__amd64)
251
252 void
253 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
254 uint_t dpl)
255 {
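/*
 * In long mode, TSS and LDT descriptors are expanded to 16 bytes; the
 * upper 32 bits of the base are held in ssd_hi64base.
 */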
256 dp->ssd_lolimit = size;
257 dp->ssd_hilimit = (uintptr_t)size >> 16;
258
259 dp->ssd_lobase = (uintptr_t)base;
260 dp->ssd_midbase = (uintptr_t)base >> 16;
261 dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
262 dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
263
264 dp->ssd_type = type;
265 dp->ssd_zero1 = 0; /* must be zero */
266 dp->ssd_zero2 = 0;
267 dp->ssd_dpl = dpl;
268 dp->ssd_p = 1;
269 dp->ssd_gran = 0; /* force byte units */
270 }
271
272 void *
273 get_ssd_base(system_desc_t *dp)
274 {
275 uintptr_t base;
276
277 base = (uintptr_t)dp->ssd_lobase |
278 (uintptr_t)dp->ssd_midbase << 16 |
279 (uintptr_t)dp->ssd_hibase << (16 + 8) |
280 (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
281 return ((void *)base);
282 }
283
284 #elif defined(__i386)
285
286 void
287 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
288 uint_t dpl)
289 {
290 dp->ssd_lolimit = size;
291 dp->ssd_hilimit = (uintptr_t)size >> 16;
292
293 dp->ssd_lobase = (uintptr_t)base;
294 dp->ssd_midbase = (uintptr_t)base >> 16;
295 dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
296
297 dp->ssd_type = type;
298 dp->ssd_zero = 0; /* must be zero */
299 dp->ssd_dpl = dpl;
300 dp->ssd_p = 1;
301 dp->ssd_gran = 0; /* force byte units */
302 }
303
304 void *
305 get_ssd_base(system_desc_t *dp)
306 {
307 uintptr_t base;
308
309 base = (uintptr_t)dp->ssd_lobase |
310 (uintptr_t)dp->ssd_midbase << 16 |
311 (uintptr_t)dp->ssd_hibase << (16 + 8);
312 return ((void *)base);
313 }
314
315 #endif /* __i386 */
316
317 /*
318 * Install gate segment descriptor for interrupt, trap, call and task gates.
319 *
320 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
321 * all interrupts. We have different ISTs for each class of exceptions that are
322 * most likely to occur while handling an existing exception; while many of
323 * these are just going to panic, it's nice not to trample on the existing
324 * exception state for debugging purposes.
325 *
326 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
327 * stack space. This unifies the trampoline handling between user and kernel
328 * space (and avoids the need to touch %gs).
329 *
330 * The KDI IDT uses the DBG IST for *all* vectors: consider single-stepping
331 * tr_pftrap, when we do a read from KMDB that causes another #PF. Without
332 * its own IST, this would stomp on the kernel's mcpu_kpti_flt frame.
333 */
334 uint_t
335 idt_vector_to_ist(uint_t vector)
336 {
337 #if defined(__xpv)
338 _NOTE(ARGUNUSED(vector));
339 return (IST_NONE);
340 #else
341 switch (vector) {
342 /* These should always use IST even without KPTI enabled. */
343 case T_DBLFLT:
344 return (IST_DF);
345 case T_NMIFLT:
346 return (IST_NMI);
347 case T_MCE:
348 return (IST_MCE);
349
350 case T_BPTFLT:
351 case T_SGLSTP:
352 if (kpti_enable == 1) {
353 return (IST_DBG);
354 }
355 return (IST_NONE);
356 case T_STKFLT:
357 case T_GPFLT:
358 case T_PGFLT:
359 if (kpti_enable == 1) {
360 return (IST_NESTABLE);
361 }
362 return (IST_NONE);
363 default:
364 if (kpti_enable == 1) {
365 return (IST_DEFAULT);
366 }
367 return (IST_NONE);
368 }
369 #endif
370 }
371
372 void
373 set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
374 uint_t type, uint_t dpl, uint_t ist)
375 {
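/*
 * A long mode (64-bit) gate descriptor is 16 bytes; the handler offset is
 * split across three fields. A non-zero ist value selects one of the TSS
 * IST stacks to switch to on entry (see idt_vector_to_ist()).
 */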
376 dp->sgd_looffset = (uintptr_t)func;
377 dp->sgd_hioffset = (uintptr_t)func >> 16;
378 dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
379 dp->sgd_selector = (uint16_t)sel;
380 dp->sgd_ist = ist;
381 dp->sgd_type = type;
382 dp->sgd_dpl = dpl;
383 dp->sgd_p = 1;
384 }
385
386 /*
387 * Updates a single user descriptor in the GDT of the current cpu.
388 * Caller is responsible for preventing cpu migration.
389 */
390
391 void
392 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
393 {
394 #if defined(__xpv)
395
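/*
 * The GDT page is read-only to us once it has been handed to the
 * hypervisor (see init_gdt()), so we compute the descriptor's physical
 * address and let the hypervisor perform the update.
 */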
396 uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
397
398 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
399 panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
400
401 #else /* __xpv */
402
403 CPU->cpu_gdt[sidx] = *udp;
404
405 #endif /* __xpv */
406 }
407
408 /*
409 * Writes the single descriptor pointed to by udp into the process's
410 * LDT entry pointed to by ldp.
411 */
412 int
413 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
414 {
415 #if defined(__xpv)
416
417 uint64_t dpa;
418
419 dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
420 ((uintptr_t)ldp & PAGEOFFSET);
421
422 /*
423 * The hypervisor is a little more restrictive about what it
424 * supports in the LDT.
425 */
426 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
427 return (EINVAL);
428
429 #else /* __xpv */
430
431 *ldp = *udp;
432
433 #endif /* __xpv */
434 return (0);
435 }
436
437 #if defined(__xpv)
438
439 /*
440 * Converts a hardware-format gate descriptor into the pseudo-IDT format
441 * used by the hypervisor. Returns true if a valid entry was written.
442 */
443 int
444 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
445 {
446 trap_info_t *ti = ti_arg; /* XXPV Aargh - segments.h comment */
447
448 /*
449 * skip holes in the IDT
450 */
451 if (GATESEG_GETOFFSET(sgd) == 0)
452 return (0);
453
454 ASSERT(sgd->sgd_type == SDT_SYSIGT);
455 ti->vector = vec;
456 TI_SET_DPL(ti, sgd->sgd_dpl);
457
458 /*
459 * Is this an interrupt gate?
460 */
461 if (sgd->sgd_type == SDT_SYSIGT) {
462 /* LINTED */
463 TI_SET_IF(ti, 1);
464 }
465 ti->cs = sgd->sgd_selector;
466 #if defined(__amd64)
467 ti->cs |= SEL_KPL; /* force into ring 3. see KCS_SEL */
468 #endif
469 ti->address = GATESEG_GETOFFSET(sgd);
470 return (1);
471 }
472
473 /*
474 * Convert a single hw format gate descriptor and write it into our virtual IDT.
475 */
476 void
477 xen_idt_write(gate_desc_t *sgd, uint_t vec)
478 {
479 trap_info_t trapinfo[2];
480
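/*
 * xen_set_trap_table() expects a zero-terminated array, so we pass two
 * entries and leave the second zeroed as the terminator.
 */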
481 bzero(trapinfo, sizeof (trapinfo));
482 if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
483 return;
484 if (xen_set_trap_table(trapinfo) != 0)
485 panic("xen_idt_write: xen_set_trap_table() failed");
486 }
487
488 #endif /* __xpv */
489
490 #if defined(__amd64)
491
492 /*
493 * Build kernel GDT.
494 */
495
496 static void
497 init_gdt_common(user_desc_t *gdt)
498 {
499 int i;
500
501 /*
502 * 64-bit kernel code segment.
503 */
504 set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
505 SDP_PAGES, SDP_OP32);
506
507 /*
508 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
509 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
510 * instruction to return from system calls back to 32-bit applications.
511 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
512 * descriptors. We therefore must ensure that whatever the kernel loads
513 * here, even though hardware ignores it in 64-bit mode, is compatible
514 * with 32-bit apps. For the same reason we must set the default op size
515 * of this descriptor to 32-bit operands.
516 */
517 set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
518 SEL_KPL, SDP_PAGES, SDP_OP32);
519 gdt[GDT_KDATA].usd_def32 = 1;
520
521 /*
522 * 64-bit user code segment.
523 */
524 set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
525 SDP_PAGES, SDP_OP32);
526
527 /*
528 * 32-bit user code segment.
529 */
530 set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
531 SEL_UPL, SDP_PAGES, SDP_OP32);
532
533 /*
534 * See gdt_ucode32() and gdt_ucode_native().
535 */
536 ucs_on = ucs_off = gdt[GDT_UCODE];
537 ucs_off.usd_p = 0; /* forces #np fault */
538
539 ucs32_on = ucs32_off = gdt[GDT_U32CODE];
540 ucs32_off.usd_p = 0; /* forces #np fault */
541
542 /*
543 * 32 and 64 bit data segments can actually share the same descriptor.
544 * In long mode only the present bit is checked but all other fields
545 * are loaded. But in compatibility mode all fields are interpreted
546 * as in legacy mode so they must be set correctly for a 32-bit data
547 * segment.
548 */
549 set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
550 SDP_PAGES, SDP_OP32);
551
552 #if !defined(__xpv)
553
554 /*
555 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
556 * in the GDT is 0.
557 */
558
559 /*
560 * Kernel TSS
561 */
562 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
563 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
564
565 #endif /* !__xpv */
566
567 /*
568 * Initialize fs and gs descriptors for 32-bit processes.
569 * Only attributes and limits are initialized; the effective
570 * base address is programmed via fsbase/gsbase.
571 */
572 set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
573 SEL_UPL, SDP_PAGES, SDP_OP32);
574 set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
575 SEL_UPL, SDP_PAGES, SDP_OP32);
576
577 /*
578 * Initialize the descriptors set aside for brand usage.
579 * Only attributes and limits are initialized.
580 */
581 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
582 set_usegd(&gdt[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
583 SEL_UPL, SDP_PAGES, SDP_OP32);
584
585 /*
586 * Initialize convenient zero base user descriptors for clearing
587 * lwp private %fs and %gs descriptors in GDT. See setregs() for
588 * an example.
589 */
590 set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
591 SDP_BYTES, SDP_OP32);
592 set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
593 SDP_PAGES, SDP_OP32);
594 }
595
596 #if defined(__xpv)
597
598 static user_desc_t *
599 init_gdt(void)
600 {
601 uint64_t gdtpa;
602 ulong_t ma[1]; /* XXPV should be a memory_t */
603 ulong_t addr;
604
605 #if !defined(__lint)
606 /*
607 * Our gdt is never larger than a single page.
608 */
609 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
610 #endif
611 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
612 PAGESIZE, PAGESIZE);
613 bzero(gdt0, PAGESIZE);
614
615 init_gdt_common(gdt0);
616
617 /*
618 * XXX Since we never invoke kmdb until after the kernel takes
619 * over the descriptor tables why not have it use the kernel's
620 * selectors?
621 */
622 if (boothowto & RB_DEBUG) {
623 set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
624 SEL_KPL, SDP_PAGES, SDP_OP32);
625 set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
626 SEL_KPL, SDP_PAGES, SDP_OP32);
627 }
628
629 /*
630 * Clear write permission for page containing the gdt and install it.
631 */
632 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
633 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
634 kbm_read_only((uintptr_t)gdt0, gdtpa);
635 xen_set_gdt(ma, NGDT);
636
637 /*
638 * Reload the segment registers to use the new GDT.
639 * On 64-bit, fixup KCS_SEL to be in ring 3.
640 * See KCS_SEL in segments.h.
641 */
642 load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);
643
644 /*
645 * setup %gs for kernel
646 */
647 xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);
648
649 /*
650 * XX64 We should never dereference off "other gsbase" or
651 * "fsbase". So, we should arrange to point FSBASE and
652 * KGSBASE somewhere truly awful e.g. point it at the last
653 * valid address below the hole so that any attempts to index
654 * off them cause an exception.
655 *
656 * For now, point it at 8G -- at least it should be unmapped
657 * until some 64-bit processes run.
658 */
659 addr = 0x200000000ul;
660 xen_set_segment_base(SEGBASE_FS, addr);
661 xen_set_segment_base(SEGBASE_GS_USER, addr);
662 xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);
663
664 return (gdt0);
665 }
666
667 #else /* __xpv */
668
669 static user_desc_t *
670 init_gdt(void)
671 {
672 desctbr_t r_bgdt, r_gdt;
673 user_desc_t *bgdt;
674
675 #if !defined(__lint)
676 /*
677 * Our gdt is never larger than a single page.
678 */
679 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
680 #endif
681 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
682 PAGESIZE, PAGESIZE);
683 bzero(gdt0, PAGESIZE);
684
685 init_gdt_common(gdt0);
686
687 /*
688 * Copy in from boot's gdt to our gdt.
689 * Entry 0 is the null descriptor by definition.
690 */
691 rd_gdtr(&r_bgdt);
692 bgdt = (user_desc_t *)r_bgdt.dtr_base;
693 if (bgdt == NULL)
694 panic("null boot gdt");
695
696 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
697 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
698 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
699 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
700 gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];
701
702 /*
703 * Install our new GDT
704 */
705 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
706 r_gdt.dtr_base = (uintptr_t)gdt0;
707 wr_gdtr(&r_gdt);
708
709 /*
710 * Reload the segment registers to use the new GDT
711 */
712 load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
713
714 /*
715 * setup %gs for kernel
716 */
717 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
718
719 /*
720 * XX64 We should never dereference off "other gsbase" or
721 * "fsbase". So, we should arrange to point FSBASE and
722 * KGSBASE somewhere truly awful e.g. point it at the last
723 * valid address below the hole so that any attempts to index
724 * off them cause an exception.
725 *
726 * For now, point it at 8G -- at least it should be unmapped
727 * until some 64-bit processes run.
728 */
729 wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
730 wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
731 return (gdt0);
732 }
733
734 #endif /* __xpv */
735
736 #elif defined(__i386)
737
738 static void
739 init_gdt_common(user_desc_t *gdt)
740 {
741 int i;
742
743 /*
744 * Text and data for both kernel and user span the entire 32-bit
745 * address space.
746 */
747
748 /*
749 * kernel code segment.
750 */
751 set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
752 SDP_OP32);
753
754 /*
755 * kernel data segment.
756 */
757 set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
758 SDP_OP32);
759
760 /*
761 * user code segment.
762 */
763 set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
764 SDP_OP32);
765
766 /*
767 * user data segment.
768 */
769 set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
770 SDP_OP32);
771
772 #if !defined(__xpv)
773
774 /*
775 * TSS for T_DBLFLT (double fault) handler
776 */
777 set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
778 sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);
779
780 /*
781 * TSS for kernel
782 */
783 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
784 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
785
786 #endif /* !__xpv */
787
788 /*
789 * %gs selector for kernel
790 */
791 set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) -1, SDT_MEMRWA,
792 SEL_KPL, SDP_BYTES, SDP_OP32);
793
794 /*
795 * Initialize lwp private descriptors.
796 * Only attributes and limits are initialized; the effective
797 * base address is programmed via fsbase/gsbase.
798 */
799 set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
800 SDP_PAGES, SDP_OP32);
801 set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
802 SDP_PAGES, SDP_OP32);
803
804 /*
805 * Initialize the descriptors set aside for brand usage.
806 * Only attributes and limits are initialized.
807 */
808 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
809 set_usegd(&gdt[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
810 SDP_PAGES, SDP_OP32);
811 /*
812 * Initialize convenient zero base user descriptor for clearing
813 * lwp private %fs and %gs descriptors in GDT. See setregs() for
814 * an example.
815 */
816 set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
817 SDP_BYTES, SDP_OP32);
818 }
819
820 #if defined(__xpv)
821
822 static user_desc_t *
823 init_gdt(void)
824 {
825 uint64_t gdtpa;
826 ulong_t ma[1]; /* XXPV should be a memory_t */
827
828 #if !defined(__lint)
829 /*
830 * Our gdt is never larger than a single page.
831 */
832 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
833 #endif
834 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
835 PAGESIZE, PAGESIZE);
836 bzero(gdt0, PAGESIZE);
837
838 init_gdt_common(gdt0);
839 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
840
841 /*
842 * XXX Since we never invoke kmdb until after the kernel takes
843 * over the descriptor tables why not have it use the kernel's
844 * selectors?
845 */
846 if (boothowto & RB_DEBUG) {
847 set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
848 SDP_PAGES, SDP_OP32);
849 set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
850 SDP_PAGES, SDP_OP32);
851 }
852
853 /*
854 * Clear write permission for page containing the gdt and install it.
855 */
856 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
857 kbm_read_only((uintptr_t)gdt0, gdtpa);
858 xen_set_gdt(ma, NGDT);
859
860 /*
861 * Reload the segment registers to use the new GDT
862 */
863 load_segment_registers(
864 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
865
866 return (gdt0);
867 }
868
869 #else /* __xpv */
870
871 static user_desc_t *
872 init_gdt(void)
873 {
874 desctbr_t r_bgdt, r_gdt;
875 user_desc_t *bgdt;
876
877 #if !defined(__lint)
878 /*
879 * Our gdt is never larger than a single page.
880 */
881 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
882 #endif
883 /*
884 * XXX this allocation belongs in our caller, not here.
885 */
886 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
887 PAGESIZE, PAGESIZE);
888 bzero(gdt0, PAGESIZE);
889
890 init_gdt_common(gdt0);
891
892 /*
893 * Copy in from boot's gdt to our gdt entries.
894 * Entry 0 is null descriptor by definition.
895 */
896 rd_gdtr(&r_bgdt);
897 bgdt = (user_desc_t *)r_bgdt.dtr_base;
898 if (bgdt == NULL)
899 panic("null boot gdt");
900
901 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
902 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
903 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
904 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
905
906 /*
907 * Install our new GDT
908 */
909 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
910 r_gdt.dtr_base = (uintptr_t)gdt0;
911 wr_gdtr(&r_gdt);
912
913 /*
914 * Reload the segment registers to use the new GDT
915 */
916 load_segment_registers(
917 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
918
919 return (gdt0);
920 }
921
922 #endif /* __xpv */
923 #endif /* __i386 */
924
925 /*
926 * Build kernel IDT.
927 *
928 * Note that for amd64 we pretty much require every gate to be an interrupt
929 * gate which blocks interrupts atomically on entry; that's because of our
930 * dependency on using 'swapgs' every time we come into the kernel to find
931 * the cpu structure. If we get interrupted just before doing that, %cs could
932 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
933 * %gsbase is really still pointing at something in userland. Bad things will
934 * ensue. We use interrupt gates for i386 as well, even though this is not
935 * required for some traps.
936 *
937 * Perhaps they should have invented a trap gate that does an atomic swapgs?
938 */
939 static void
940 init_idt_common(gate_desc_t *idt)
941 {
942 set_gatesegd(&idt[T_ZERODIV],
943 (kpti_enable == 1) ? &tr_div0trap : &div0trap,
944 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
945 set_gatesegd(&idt[T_SGLSTP],
946 (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
947 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
948 set_gatesegd(&idt[T_NMIFLT],
949 (kpti_enable == 1) ? &tr_nmiint : &nmiint,
950 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
951 set_gatesegd(&idt[T_BPTFLT],
952 (kpti_enable == 1) ? &tr_brktrap : &brktrap,
953 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
954 set_gatesegd(&idt[T_OVFLW],
955 (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
956 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
957 set_gatesegd(&idt[T_BOUNDFLT],
958 (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
959 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
960 set_gatesegd(&idt[T_ILLINST],
961 (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
962 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
963 set_gatesegd(&idt[T_NOEXTFLT],
964 (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
965 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
966
967 /*
968 * double fault handler.
969 *
970 * Note that on the hypervisor a guest does not receive #df faults.
971 * Instead a failsafe event is injected into the guest if its selectors
972 * and/or stack are in a broken state. See xen_failsafe_callback.
973 */
974 #if !defined(__xpv)
975 set_gatesegd(&idt[T_DBLFLT],
976 (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
977 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
978 #endif /* !__xpv */
979
980 /*
981 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
982 */
983 set_gatesegd(&idt[T_TSSFLT],
984 (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
985 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
986 set_gatesegd(&idt[T_SEGFLT],
987 (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
988 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
989 set_gatesegd(&idt[T_STKFLT],
990 (kpti_enable == 1) ? &tr_stktrap : &stktrap,
991 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
992 set_gatesegd(&idt[T_GPFLT],
993 (kpti_enable == 1) ? &tr_gptrap : &gptrap,
994 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
995 set_gatesegd(&idt[T_PGFLT],
996 (kpti_enable == 1) ? &tr_pftrap : &pftrap,
997 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
998 set_gatesegd(&idt[T_EXTERRFLT],
999 (kpti_enable == 1) ? &tr_ndperr : &ndperr,
1000 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
1001 set_gatesegd(&idt[T_ALIGNMENT],
1002 (kpti_enable == 1) ? &tr_achktrap : &achktrap,
1003 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
1004 set_gatesegd(&idt[T_MCE],
1005 (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
1006 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
1007 set_gatesegd(&idt[T_SIMDFPE],
1008 (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
1009 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
1010
1011 /*
1012 * install fast trap handler at 210.
1013 */
1014 set_gatesegd(&idt[T_FASTTRAP],
1015 (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
1016 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
1017
1018 /*
1019 * System call handler.
1020 */
1021 set_gatesegd(&idt[T_SYSCALLINT],
1022 (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
1023 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
1024
1025 /*
1026 * Install the DTrace interrupt handler for the pid provider.
1027 */
1028 set_gatesegd(&idt[T_DTRACE_RET],
1029 (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
1030 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
1031
1032 /*
1033 * Prepare interposing descriptor for the syscall handler
1034 * and cache copy of the default descriptor.
1035 */
1036 brand_tbl[0].ih_inum = T_SYSCALLINT;
1037 brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
1038
1039 set_gatesegd(&(brand_tbl[0].ih_interp_desc),
1040 (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
1041 &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
1042 idt_vector_to_ist(T_SYSCALLINT));
1043
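/* Entry 1 is the NULL signpost that terminates the interposition table. */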
1044 brand_tbl[1].ih_inum = 0;
1045 }
1046
1047 #if defined(__xpv)
1048
1049 static void
1050 init_idt(gate_desc_t *idt)
1051 {
1052 init_idt_common(idt);
1053 }
1054
1055 #else /* __xpv */
1056
1057 static void
1058 init_idt(gate_desc_t *idt)
1059 {
1060 char ivctname[80];
1061 void (*ivctptr)(void);
1062 int i;
1063
1064 /*
1065 * Initialize entire table with 'reserved' trap and then overwrite
1066 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
1067 * since it can only be generated on a 386 processor. 15 is also
1068 * unsupported and reserved.
1069 */
1070 #if !defined(__xpv)
1071 for (i = 0; i < NIDT; i++) {
1072 set_gatesegd(&idt[i],
1073 (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
1074 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1075 idt_vector_to_ist(T_RESVTRAP));
1076 }
1077 #else
1078 for (i = 0; i < NIDT; i++) {
1079 set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1080 IST_NONE);
1081 }
1082 #endif
1083
1084 /*
1085 * 20-31 reserved
1086 */
1087 #if !defined(__xpv)
1088 for (i = 20; i < 32; i++) {
1089 set_gatesegd(&idt[i],
1090 (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
1091 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1092 idt_vector_to_ist(T_INVALTRAP));
1093 }
1094 #else
1095 for (i = 20; i < 32; i++) {
1096 set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1097 IST_NONE);
1098 }
1099 #endif
1100
1101 /*
1102 * interrupts 32 - 255
1103 */
1104 for (i = 32; i < 256; i++) {
1105 #if !defined(__xpv)
1106 (void) snprintf(ivctname, sizeof (ivctname),
1107 (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
1108 #else
1109 (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
1110 #endif
1111 ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
1112 if (ivctptr == NULL)
1113 panic("kobj_getsymvalue(%s) failed", ivctname);
1114
1115 set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1116 idt_vector_to_ist(i));
1117 }
1118
1119 /*
1120 * Now install the common ones. Note that it will overlay some
1121 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
1122 */
1123 init_idt_common(idt);
1124 }
1125
1126 #endif /* __xpv */
1127
1128 /*
1129 * The kernel does not deal with LDTs unless a user explicitly creates
1130 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
1131 * to reference the LDT will therefore cause a #gp. System calls made via the
1132 * obsolete lcall mechanism are emulated by the #gp fault handler.
1133 */
1134 static void
1135 init_ldt(void)
1136 {
1137 #if defined(__xpv)
1138 xen_set_ldt(NULL, 0);
1139 #else
1140 wr_ldtr(0);
1141 #endif
1142 }
1143
1144 #if !defined(__xpv)
1145
1146 static void
1147 init_tss(void)
1148 {
1149 extern struct cpu cpus[];
1150
1151 /*
1152 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
1153 * context switch but it'll be overwritten with this same value anyway.
1154 */
1155 if (kpti_enable == 1) {
1156 ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1157 }
1158
1159 /* Set up the IST stacks for double fault, NMI, MCE. */
1160 ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
1161 ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
1162 ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
1163
1164 /*
1165 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
1166 * enabled), and also for KDI (always).
1167 */
1168 ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
1169
1170 if (kpti_enable == 1) {
1171 /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
1172 ktss0->tss_ist5 =
1173 (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
1174
1175 /* This IST stack is used for all other intrs (for KPTI). */
1176 ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1177 }
1178
1179 /*
1180 * Set the I/O bit map offset equal to the size of the TSS segment
1181 * so that there is no I/O permission map. This forces all user I/O
1182 * instructions to generate a #gp fault.
1183 */
1184 ktss0->tss_bitmapbase = sizeof (*ktss0);
1185
1186 /*
1187 * Point %tr to descriptor for ktss0 in gdt.
1188 */
1189 wr_tsr(KTSS_SEL);
1190 }
1191
1192 #endif /* !__xpv */
1193
1194 #if defined(__xpv)
1195
1196 void
1197 init_desctbls(void)
1198 {
1199 uint_t vec;
1200 user_desc_t *gdt;
1201
1202 /*
1203 * Setup and install our GDT.
1204 */
1205 gdt = init_gdt();
1206
1207 /*
1208 * Store static pa of gdt to speed up pa_to_ma() translations
1209 * on lwp context switches.
1210 */
1211 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1212 CPU->cpu_gdt = gdt;
1213 CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));
1214
1215 /*
1216 * Setup and install our IDT.
1217 */
1218 #if !defined(__lint)
1219 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1220 #endif
1221 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1222 PAGESIZE, PAGESIZE);
1223 bzero(idt0, PAGESIZE);
1224 init_idt(idt0);
1225 for (vec = 0; vec < NIDT; vec++)
1226 xen_idt_write(&idt0[vec], vec);
1227
1228 CPU->cpu_idt = idt0;
1229
1230 /*
1231 * set default kernel stack
1232 */
1233 xen_stack_switch(KDS_SEL,
1234 (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);
1235
1236 xen_init_callbacks();
1237
1238 init_ldt();
1239 }
1240
1241 #else /* __xpv */
1242
1243 void
1244 init_desctbls(void)
1245 {
1246 user_desc_t *gdt;
1247 desctbr_t idtr;
1248
1249 /*
1250 * Allocate IDT and TSS structures on unique pages for better
1251 * performance in virtual machines.
1252 */
1253 #if !defined(__lint)
1254 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1255 #endif
1256 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1257 PAGESIZE, PAGESIZE);
1258 bzero(idt0, PAGESIZE);
1259 #if !defined(__lint)
1260 ASSERT(sizeof (*ktss0) <= PAGESIZE);
1261 #endif
1262 ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
1263 PAGESIZE, PAGESIZE);
1264 bzero(ktss0, PAGESIZE);
1265
1266 #if defined(__i386)
1267 #if !defined(__lint)
1268 ASSERT(sizeof (*dftss0) <= PAGESIZE);
1269 #endif
1270 dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
1271 PAGESIZE, PAGESIZE);
1272 bzero(dftss0, PAGESIZE);
1273 #endif
1274
1275 /*
1276 * Setup and install our GDT.
1277 */
1278 gdt = init_gdt();
1279 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1280 CPU->cpu_gdt = gdt;
1281
1282 /*
1283 * Initialize this CPU's LDT.
1284 */
1285 CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
1286 LDT_CPU_SIZE, PAGESIZE);
1287 bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
1288 CPU->cpu_m.mcpu_ldt_len = 0;
1289
1290 /*
1291 * Setup and install our IDT.
1292 */
1293 init_idt(idt0);
1294
1295 idtr.dtr_base = (uintptr_t)idt0;
1296 idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
1297 wr_idtr(&idtr);
1298 CPU->cpu_idt = idt0;
1299
1300 #if defined(__i386)
1301 /*
1302 * We maintain a description of idt0 in convenient IDTR format
1303 * for #pf's on some older pentium processors. See pentium_pftrap().
1304 */
1305 idt0_default_r = idtr;
1306 #endif /* __i386 */
1307
1308 init_tss();
1309 CPU->cpu_tss = ktss0;
1310 init_ldt();
1311
1312 /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
1313 kpti_safe_cr3 = (uint64_t)getcr3();
1314 }
1315
1316 #endif /* __xpv */
1317
1318 /*
1319 * In the early kernel, we need to set up a simple GDT to run on.
1320 *
1321 * XXPV Can dboot use this too? See dboot_gdt.s
1322 */
1323 void
1324 init_boot_gdt(user_desc_t *bgdt)
1325 {
1326 #if defined(__amd64)
1327 set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
1328 SDP_PAGES, SDP_OP32);
1329 set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
1330 SDP_PAGES, SDP_OP32);
1331 #elif defined(__i386)
1332 set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
1333 SDP_PAGES, SDP_OP32);
1334 set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
1335 SDP_PAGES, SDP_OP32);
1336 #endif /* __i386 */
1337 }
1338
1339 /*
1340 * Enable interpositioning on the system call path by rewriting the
1341 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1342 * the branded entry points.
1343 */
1344 void
1345 brand_interpositioning_enable(void)
1346 {
1347 gate_desc_t *idt = CPU->cpu_idt;
1348 int i;
1349
1350 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1351
1352 for (i = 0; brand_tbl[i].ih_inum; i++) {
1353 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
1354 #if defined(__xpv)
1355 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1356 brand_tbl[i].ih_inum);
1357 #endif
1358 }
1359
1360 #if defined(__amd64)
1361 #if defined(__xpv)
1362
1363 /*
1364 * Currently the hypervisor only supports 64-bit syscalls via
1365 * the syscall instruction. The 32-bit syscalls are handled by the
1366 * interrupt gate above.
1367 */
1368 xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
1369 CALLBACKF_mask_events);
1370
1371 #else
1372
1373 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1374 if (kpti_enable == 1) {
1375 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
1376 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
1377 } else {
1378 wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
1379 wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
1380 }
1381 }
1382
1383 #endif
1384 #endif /* __amd64 */
1385
1386 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1387 if (kpti_enable == 1) {
1388 wrmsr(MSR_INTC_SEP_EIP,
1389 (uintptr_t)tr_brand_sys_sysenter);
1390 } else {
1391 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
1392 }
1393 }
1394 }
1395
1396 /*
1397 * Disable interpositioning on the system call path by rewriting the
1398 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1399 * the standard entry points, which bypass the interpositioning hooks.
1400 */
1401 void
1402 brand_interpositioning_disable(void)
1403 {
1404 gate_desc_t *idt = CPU->cpu_idt;
1405 int i;
1406
1407 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1408
1409 for (i = 0; brand_tbl[i].ih_inum; i++) {
1410 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
1411 #if defined(__xpv)
1412 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1413 brand_tbl[i].ih_inum);
1414 #endif
1415 }
1416
1417 #if defined(__amd64)
1418 #if defined(__xpv)
1419
1420 /*
1421 * See comment above in brand_interpositioning_enable.
1422 */
1423 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
1424 CALLBACKF_mask_events);
1425
1426 #else
1427
1428 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1429 if (kpti_enable == 1) {
1430 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
1431 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
1432 } else {
1433 wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
1434 wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
1435 }
1436 }
1437
1438 #endif
1439 #endif /* __amd64 */
1440
1441 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1442 if (kpti_enable == 1) {
1443 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
1444 } else {
1445 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
1446 }
1447 }
1448 }