Print this page
9600 LDT still not happy under KPTI
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/intel/ia32/os/desctbls.c
+++ new/usr/src/uts/intel/ia32/os/desctbls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2018 Joyent, Inc. All rights reserved.
28 28 */
29 29
30 30 /*
31 31 * Copyright (c) 1992 Terrence R. Lambert.
32 32 * Copyright (c) 1990 The Regents of the University of California.
33 33 * All rights reserved.
34 34 *
35 35 * This code is derived from software contributed to Berkeley by
36 36 * William Jolitz.
37 37 *
38 38 * Redistribution and use in source and binary forms, with or without
39 39 * modification, are permitted provided that the following conditions
40 40 * are met:
41 41 * 1. Redistributions of source code must retain the above copyright
42 42 * notice, this list of conditions and the following disclaimer.
43 43 * 2. Redistributions in binary form must reproduce the above copyright
44 44 * notice, this list of conditions and the following disclaimer in the
45 45 * documentation and/or other materials provided with the distribution.
46 46 * 3. All advertising materials mentioning features or use of this software
47 47 * must display the following acknowledgement:
48 48 * This product includes software developed by the University of
49 49 * California, Berkeley and its contributors.
50 50 * 4. Neither the name of the University nor the names of its contributors
51 51 * may be used to endorse or promote products derived from this software
52 52 * without specific prior written permission.
53 53 *
54 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 64 * SUCH DAMAGE.
65 65 *
66 66 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
67 67 */
68 68
69 69 #include <sys/types.h>
70 70 #include <sys/sysmacros.h>
71 71 #include <sys/tss.h>
72 72 #include <sys/segments.h>
73 73 #include <sys/trap.h>
74 74 #include <sys/cpuvar.h>
75 75 #include <sys/bootconf.h>
76 76 #include <sys/x86_archext.h>
77 77 #include <sys/controlregs.h>
78 78 #include <sys/archsystm.h>
79 79 #include <sys/machsystm.h>
80 80 #include <sys/kobj.h>
81 81 #include <sys/cmn_err.h>
82 82 #include <sys/reboot.h>
83 83 #include <sys/kdi.h>
84 84 #include <sys/mach_mmu.h>
85 85 #include <sys/systm.h>
86 86 #include <sys/note.h>
87 87
88 88 #ifdef __xpv
89 89 #include <sys/hypervisor.h>
90 90 #include <vm/as.h>
91 91 #endif
92 92
93 93 #include <sys/promif.h>
94 94 #include <sys/bootinfo.h>
95 95 #include <vm/kboot_mmu.h>
96 96 #include <vm/hat_pte.h>
97 97
98 98 /*
99 99 * cpu0 and default tables and structures.
100 100 */
101 101 user_desc_t *gdt0;
102 102 #if !defined(__xpv)
103 103 desctbr_t gdt0_default_r;
104 104 #endif
105 105
106 106 gate_desc_t *idt0; /* interrupt descriptor table */
107 107 #if defined(__i386)
108 108 desctbr_t idt0_default_r; /* describes idt0 in IDTR format */
109 109 #endif
110 110
111 111 tss_t *ktss0; /* kernel task state structure */
112 112
113 113 #if defined(__i386)
114 114 tss_t *dftss0; /* #DF double-fault exception */
115 115 #endif /* __i386 */
116 116
117 117 user_desc_t zero_udesc; /* base zero user desc native procs */
118 118 user_desc_t null_udesc; /* null user descriptor */
119 119 system_desc_t null_sdesc; /* null system descriptor */
120 120
121 121 #if defined(__amd64)
122 122 user_desc_t zero_u32desc; /* 32-bit compatibility procs */
123 123 #endif /* __amd64 */
124 124
125 125 #if defined(__amd64)
126 126 user_desc_t ucs_on;
127 127 user_desc_t ucs_off;
128 128 user_desc_t ucs32_on;
129 129 user_desc_t ucs32_off;
130 130 #endif /* __amd64 */
131 131
132 132 /*
133 133 * If the size of this is changed, you must update hat_pcp_setup() and the
134 134 * definitions in exception.s
135 135 */
136 136 extern char dblfault_stack0[DEFAULTSTKSZ];
137 137 extern char nmi_stack0[DEFAULTSTKSZ];
138 138 extern char mce_stack0[DEFAULTSTKSZ];
139 139
140 140 extern void fast_null(void);
141 141 extern hrtime_t get_hrtime(void);
142 142 extern hrtime_t gethrvtime(void);
143 143 extern hrtime_t get_hrestime(void);
144 144 extern uint64_t getlgrp(void);
145 145
146 146 void (*(fasttable[]))(void) = {
147 147 fast_null, /* T_FNULL routine */
148 148 fast_null, /* T_FGETFP routine (initially null) */
149 149 fast_null, /* T_FSETFP routine (initially null) */
150 150 (void (*)())get_hrtime, /* T_GETHRTIME */
151 151 (void (*)())gethrvtime, /* T_GETHRVTIME */
152 152 (void (*)())get_hrestime, /* T_GETHRESTIME */
153 153 (void (*)())getlgrp /* T_GETLGRP */
154 154 };
155 155
156 156 /*
157 157 * Structure containing pre-computed descriptors to allow us to temporarily
158 158 * interpose on a standard handler.
159 159 */
160 160 struct interposing_handler {
161 161 int ih_inum;
162 162 gate_desc_t ih_interp_desc;
163 163 gate_desc_t ih_default_desc;
164 164 };
165 165
166 166 /*
167 167 * The brand infrastructure interposes on two handlers, and we use one as a
168 168 * NULL signpost.
169 169 */
170 170 static struct interposing_handler brand_tbl[2];
↓ open down ↓ |
170 lines elided |
↑ open up ↑ |
171 171
172 172 /*
173 173 * software prototypes for default local descriptor table
174 174 */
175 175
176 176 /*
177 177 * Routines for loading segment descriptors in a format the hardware
178 178 * can understand.
179 179 */
180 180
181 -#if defined(__amd64)
182 -
183 181 /*
184 182 * In long mode we have the new L or long mode attribute bit
185 183 * for code segments. Only the conforming bit in type is used along
186 184 * with descriptor priority and present bits. Default operand size must
187 185 * be zero when in long mode. In 32-bit compatibility mode all fields
188 186 * are treated as in legacy mode. For data segments while in long mode
189 187 * only the present bit is loaded.
190 188 */
191 189 void
192 190 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
193 191 uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
194 192 {
195 193 ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
194 + /* This should never be a "system" segment. */
195 + ASSERT3U(type & SDT_S, !=, 0);
196 196
197 197 /*
198 198 * 64-bit long mode.
199 199 */
200 200 if (lmode == SDP_LONG)
201 201 dp->usd_def32 = 0; /* 32-bit operands only */
202 202 else
203 203 /*
204 204 * 32-bit compatibility mode.
205 205 */
206 206 dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32-bit ops */
207 207
208 + /*
209 + * We should always set the "accessed" bit (SDT_A), otherwise the CPU
210 + * will write to the GDT whenever we change segment registers around.
211 + * With KPTI on, the GDT is read-only in the user page table, which
212 + * causes crashes if we don't set this.
213 + */
214 + ASSERT3U(type & SDT_A, !=, 0);
215 +
208 216 dp->usd_long = lmode; /* 64-bit mode */
209 217 dp->usd_type = type;
210 218 dp->usd_dpl = dpl;
211 219 dp->usd_p = 1;
212 220 dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
213 221
214 222 dp->usd_lobase = (uintptr_t)base;
215 223 dp->usd_midbase = (uintptr_t)base >> 16;
216 224 dp->usd_hibase = (uintptr_t)base >> (16 + 8);
217 225 dp->usd_lolimit = size;
218 226 dp->usd_hilimit = (uintptr_t)size >> 16;
219 227 }
220 228
221 -#elif defined(__i386)
222 -
223 229 /*
224 - * Install user segment descriptor for code and data.
225 - */
226 -void
227 -set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
228 - uint_t dpl, uint_t gran, uint_t defopsz)
229 -{
230 - dp->usd_lolimit = size;
231 - dp->usd_hilimit = (uintptr_t)size >> 16;
232 -
233 - dp->usd_lobase = (uintptr_t)base;
234 - dp->usd_midbase = (uintptr_t)base >> 16;
235 - dp->usd_hibase = (uintptr_t)base >> (16 + 8);
236 -
237 - dp->usd_type = type;
238 - dp->usd_dpl = dpl;
239 - dp->usd_p = 1;
240 - dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32 bit operands */
241 - dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
242 -}
243 -
244 -#endif /* __i386 */
245 -
246 -/*
247 230 * Install system segment descriptor for LDT and TSS segments.
248 231 */
249 232
250 -#if defined(__amd64)
251 -
252 233 void
253 234 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
254 235 uint_t dpl)
255 236 {
256 237 dp->ssd_lolimit = size;
257 238 dp->ssd_hilimit = (uintptr_t)size >> 16;
258 239
259 240 dp->ssd_lobase = (uintptr_t)base;
260 241 dp->ssd_midbase = (uintptr_t)base >> 16;
261 242 dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
262 243 dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
263 244
264 245 dp->ssd_type = type;
265 246 dp->ssd_zero1 = 0; /* must be zero */
266 247 dp->ssd_zero2 = 0;
267 248 dp->ssd_dpl = dpl;
268 249 dp->ssd_p = 1;
269 250 dp->ssd_gran = 0; /* force byte units */
270 251 }
271 252
272 253 void *
273 254 get_ssd_base(system_desc_t *dp)
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
274 255 {
275 256 uintptr_t base;
276 257
277 258 base = (uintptr_t)dp->ssd_lobase |
278 259 (uintptr_t)dp->ssd_midbase << 16 |
279 260 (uintptr_t)dp->ssd_hibase << (16 + 8) |
280 261 (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
281 262 return ((void *)base);
282 263 }
283 264
284 -#elif defined(__i386)
285 -
286 -void
287 -set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
288 - uint_t dpl)
289 -{
290 - dp->ssd_lolimit = size;
291 - dp->ssd_hilimit = (uintptr_t)size >> 16;
292 -
293 - dp->ssd_lobase = (uintptr_t)base;
294 - dp->ssd_midbase = (uintptr_t)base >> 16;
295 - dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
296 -
297 - dp->ssd_type = type;
298 - dp->ssd_zero = 0; /* must be zero */
299 - dp->ssd_dpl = dpl;
300 - dp->ssd_p = 1;
301 - dp->ssd_gran = 0; /* force byte units */
302 -}
303 -
304 -void *
305 -get_ssd_base(system_desc_t *dp)
306 -{
307 - uintptr_t base;
308 -
309 - base = (uintptr_t)dp->ssd_lobase |
310 - (uintptr_t)dp->ssd_midbase << 16 |
311 - (uintptr_t)dp->ssd_hibase << (16 + 8);
312 - return ((void *)base);
313 -}
314 -
315 -#endif /* __i386 */
316 -
317 265 /*
318 266 * Install gate segment descriptor for interrupt, trap, call and task gates.
319 267 *
320 268 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
321 269 * all interrupts. We have different ISTs for each class of exceptions that are
322 270 * most likely to occur while handling an existing exception; while many of
323 271 * these are just going to panic, it's nice not to trample on the existing
324 272 * exception state for debugging purposes.
325 273 *
326 274 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
327 275 * stack space. This unifies the trampoline handling between user and kernel
328 276 * space (and avoids the need to touch %gs).
329 277 *
330 278 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
331 279 * we do a read from KMDB that causes another #PF. Without its own IST, this
332 280 * would stomp on the kernel's mcpu_kpti_flt frame.
333 281 */
334 282 uint_t
335 283 idt_vector_to_ist(uint_t vector)
336 284 {
337 285 #if defined(__xpv)
338 286 _NOTE(ARGUNUSED(vector));
339 287 return (IST_NONE);
340 288 #else
341 289 switch (vector) {
342 290 /* These should always use IST even without KPTI enabled. */
343 291 case T_DBLFLT:
344 292 return (IST_DF);
345 293 case T_NMIFLT:
346 294 return (IST_NMI);
347 295 case T_MCE:
348 296 return (IST_MCE);
349 297
350 298 case T_BPTFLT:
351 299 case T_SGLSTP:
352 300 if (kpti_enable == 1) {
353 301 return (IST_DBG);
354 302 }
355 303 return (IST_NONE);
356 304 case T_STKFLT:
357 305 case T_GPFLT:
358 306 case T_PGFLT:
359 307 if (kpti_enable == 1) {
360 308 return (IST_NESTABLE);
361 309 }
362 310 return (IST_NONE);
363 311 default:
364 312 if (kpti_enable == 1) {
365 313 return (IST_DEFAULT);
366 314 }
367 315 return (IST_NONE);
368 316 }
369 317 #endif
370 318 }
371 319
372 320 void
373 321 set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
374 322 uint_t type, uint_t dpl, uint_t ist)
375 323 {
376 324 dp->sgd_looffset = (uintptr_t)func;
377 325 dp->sgd_hioffset = (uintptr_t)func >> 16;
378 326 dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
379 327 dp->sgd_selector = (uint16_t)sel;
380 328 dp->sgd_ist = ist;
381 329 dp->sgd_type = type;
382 330 dp->sgd_dpl = dpl;
383 331 dp->sgd_p = 1;
↓ open down ↓ |
57 lines elided |
↑ open up ↑ |
384 332 }
385 333
386 334 /*
387 335 * Updates a single user descriptor in the GDT of the current cpu.
388 336 * Caller is responsible for preventing cpu migration.
389 337 */
390 338
391 339 void
392 340 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
393 341 {
394 -#if defined(__xpv)
342 +#if defined(DEBUG)
343 + /* This should never be a "system" segment, but it might be null. */
344 + if (udp->usd_p != 0 || udp->usd_type != 0) {
345 + ASSERT3U(udp->usd_type & SDT_S, !=, 0);
346 + }
347 + /*
348 + * We should always set the "accessed" bit (SDT_A), otherwise the CPU
349 + * will write to the GDT whenever we change segment registers around.
350 + * With KPTI on, the GDT is read-only in the user page table, which
351 + * causes crashes if we don't set this.
352 + */
353 + if (udp->usd_p != 0 || udp->usd_type != 0) {
354 + ASSERT3U(udp->usd_type & SDT_A, !=, 0);
355 + }
356 +#endif
395 357
358 +#if defined(__xpv)
396 359 uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
397 360
398 361 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
399 362 panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
400 363
401 364 #else /* __xpv */
402 -
403 365 CPU->cpu_gdt[sidx] = *udp;
404 -
405 366 #endif /* __xpv */
406 367 }
407 368
408 369 /*
409 370 * Writes a single descriptor pointed to by udp into a process's
410 371 * LDT entry pointed to by ldp.
411 372 */
412 373 int
413 374 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
414 375 {
415 -#if defined(__xpv)
376 +#if defined(DEBUG)
377 + /* This should never be a "system" segment, but it might be null. */
378 + if (udp->usd_p != 0 || udp->usd_type != 0) {
379 + ASSERT3U(udp->usd_type & SDT_S, !=, 0);
380 + }
381 + /*
382 + * We should always set the "accessed" bit (SDT_A), otherwise the CPU
383 + * will write to the LDT whenever we change segment registers around.
384 + * With KPTI on, the LDT is read-only in the user page table, which
385 + * causes crashes if we don't set this.
386 + */
387 + if (udp->usd_p != 0 || udp->usd_type != 0) {
388 + ASSERT3U(udp->usd_type & SDT_A, !=, 0);
389 + }
390 +#endif
416 391
392 +#if defined(__xpv)
417 393 uint64_t dpa;
418 394
419 395 dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
420 396 ((uintptr_t)ldp & PAGEOFFSET);
421 397
422 398 /*
423 399 * The hypervisor is a little more restrictive about what it
424 400 * supports in the LDT.
425 401 */
426 402 if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
427 403 return (EINVAL);
428 404
429 405 #else /* __xpv */
430 -
431 406 *ldp = *udp;
432 407
433 408 #endif /* __xpv */
434 409 return (0);
435 410 }
436 411
437 412 #if defined(__xpv)
438 413
439 414 /*
440 415 * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
441 416 * Returns true if a valid entry was written.
442 417 */
443 418 int
444 419 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
445 420 {
446 421 trap_info_t *ti = ti_arg; /* XXPV Aargh - segments.h comment */
447 422
448 423 /*
449 424 * skip holes in the IDT
450 425 */
451 426 if (GATESEG_GETOFFSET(sgd) == 0)
452 427 return (0);
453 428
454 429 ASSERT(sgd->sgd_type == SDT_SYSIGT);
455 430 ti->vector = vec;
456 431 TI_SET_DPL(ti, sgd->sgd_dpl);
457 432
458 433 /*
459 434 * Is this an interrupt gate?
460 435 */
461 436 if (sgd->sgd_type == SDT_SYSIGT) {
462 437 /* LINTED */
463 438 TI_SET_IF(ti, 1);
464 439 }
465 440 ti->cs = sgd->sgd_selector;
466 441 #if defined(__amd64)
467 442 ti->cs |= SEL_KPL; /* force into ring 3. see KCS_SEL */
468 443 #endif
469 444 ti->address = GATESEG_GETOFFSET(sgd);
470 445 return (1);
471 446 }
472 447
473 448 /*
474 449 * Convert a single hw format gate descriptor and write it into our virtual IDT.
475 450 */
476 451 void
477 452 xen_idt_write(gate_desc_t *sgd, uint_t vec)
478 453 {
479 454 trap_info_t trapinfo[2];
480 455
481 456 bzero(trapinfo, sizeof (trapinfo));
482 457 if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
483 458 return;
484 459 if (xen_set_trap_table(trapinfo) != 0)
485 460 panic("xen_idt_write: xen_set_trap_table() failed");
486 461 }
487 462
488 463 #endif /* __xpv */
489 464
490 465 #if defined(__amd64)
491 466
492 467 /*
493 468 * Build kernel GDT.
494 469 */
495 470
496 471 static void
497 472 init_gdt_common(user_desc_t *gdt)
498 473 {
499 474 int i;
500 475
501 476 /*
502 477 * 64-bit kernel code segment.
503 478 */
504 479 set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
505 480 SDP_PAGES, SDP_OP32);
506 481
507 482 /*
508 483 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
509 484 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
510 485 * instruction to return from system calls back to 32-bit applications.
511 486 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
512 487 * descriptors. We therefore must ensure that the kernel uses something,
513 488 * though it will be ignored by hardware, that is compatible with 32-bit
514 489 * apps. For the same reason we must set the default op size of this
515 490 * descriptor to 32-bit operands.
516 491 */
517 492 set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
518 493 SEL_KPL, SDP_PAGES, SDP_OP32);
519 494 gdt[GDT_KDATA].usd_def32 = 1;
520 495
521 496 /*
522 497 * 64-bit user code segment.
523 498 */
524 499 set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
525 500 SDP_PAGES, SDP_OP32);
526 501
527 502 /*
528 503 * 32-bit user code segment.
529 504 */
530 505 set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
531 506 SEL_UPL, SDP_PAGES, SDP_OP32);
532 507
533 508 /*
534 509 * See gdt_ucode32() and gdt_ucode_native().
535 510 */
536 511 ucs_on = ucs_off = gdt[GDT_UCODE];
537 512 ucs_off.usd_p = 0; /* forces #np fault */
538 513
539 514 ucs32_on = ucs32_off = gdt[GDT_U32CODE];
540 515 ucs32_off.usd_p = 0; /* forces #np fault */
541 516
542 517 /*
543 518 * 32 and 64 bit data segments can actually share the same descriptor.
544 519 * In long mode only the present bit is checked but all other fields
545 520 * are loaded. But in compatibility mode all fields are interpreted
546 521 * as in legacy mode so they must be set correctly for a 32-bit data
547 522 * segment.
548 523 */
549 524 set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
550 525 SDP_PAGES, SDP_OP32);
551 526
552 527 #if !defined(__xpv)
553 528
554 529 /*
555 530 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
556 531 * in the GDT is 0.
557 532 */
558 533
559 534 /*
560 535 * Kernel TSS
561 536 */
562 537 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
563 538 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
564 539
565 540 #endif /* !__xpv */
566 541
567 542 /*
568 543 * Initialize fs and gs descriptors for 32 bit processes.
569 544 * Only attributes and limits are initialized, the effective
570 545 * base address is programmed via fsbase/gsbase.
571 546 */
572 547 set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
573 548 SEL_UPL, SDP_PAGES, SDP_OP32);
574 549 set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
575 550 SEL_UPL, SDP_PAGES, SDP_OP32);
576 551
577 552 /*
578 553 * Initialize the descriptors set aside for brand usage.
579 554 * Only attributes and limits are initialized.
580 555 */
581 556 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
582 557 set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
583 558 SEL_UPL, SDP_PAGES, SDP_OP32);
584 559
585 560 /*
586 561 * Initialize convenient zero base user descriptors for clearing
587 562 * lwp private %fs and %gs descriptors in GDT. See setregs() for
588 563 * an example.
589 564 */
590 565 set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
591 566 SDP_BYTES, SDP_OP32);
592 567 set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
593 568 SDP_PAGES, SDP_OP32);
594 569 }
595 570
596 571 #if defined(__xpv)
597 572
598 573 static user_desc_t *
599 574 init_gdt(void)
600 575 {
601 576 uint64_t gdtpa;
602 577 ulong_t ma[1]; /* XXPV should be a memory_t */
603 578 ulong_t addr;
604 579
605 580 #if !defined(__lint)
606 581 /*
607 582 * Our gdt is never larger than a single page.
608 583 */
609 584 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
610 585 #endif
611 586 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
612 587 PAGESIZE, PAGESIZE);
613 588 bzero(gdt0, PAGESIZE);
614 589
615 590 init_gdt_common(gdt0);
616 591
617 592 /*
618 593 * XXX Since we never invoke kmdb until after the kernel takes
619 594 * over the descriptor tables why not have it use the kernel's
620 595 * selectors?
621 596 */
622 597 if (boothowto & RB_DEBUG) {
623 598 set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
624 599 SEL_KPL, SDP_PAGES, SDP_OP32);
625 600 set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
626 601 SEL_KPL, SDP_PAGES, SDP_OP32);
627 602 }
628 603
629 604 /*
630 605 * Clear write permission for page containing the gdt and install it.
631 606 */
632 607 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
633 608 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
634 609 kbm_read_only((uintptr_t)gdt0, gdtpa);
635 610 xen_set_gdt(ma, NGDT);
636 611
637 612 /*
638 613 * Reload the segment registers to use the new GDT.
639 614 * On 64-bit, fixup KCS_SEL to be in ring 3.
640 615 * See KCS_SEL in segments.h.
641 616 */
642 617 load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);
643 618
644 619 /*
645 620 * setup %gs for kernel
646 621 */
647 622 xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);
648 623
649 624 /*
650 625 * XX64 We should never dereference off "other gsbase" or
651 626 * "fsbase". So, we should arrange to point FSBASE and
652 627 * KGSBASE somewhere truly awful e.g. point it at the last
653 628 * valid address below the hole so that any attempts to index
654 629 * off them cause an exception.
655 630 *
656 631 * For now, point it at 8G -- at least it should be unmapped
657 632 * until some 64-bit processes run.
658 633 */
659 634 addr = 0x200000000ul;
660 635 xen_set_segment_base(SEGBASE_FS, addr);
661 636 xen_set_segment_base(SEGBASE_GS_USER, addr);
662 637 xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);
663 638
664 639 return (gdt0);
665 640 }
666 641
667 642 #else /* __xpv */
668 643
669 644 static user_desc_t *
670 645 init_gdt(void)
671 646 {
672 647 desctbr_t r_bgdt, r_gdt;
673 648 user_desc_t *bgdt;
674 649
675 650 #if !defined(__lint)
676 651 /*
677 652 * Our gdt is never larger than a single page.
678 653 */
679 654 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
680 655 #endif
681 656 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
682 657 PAGESIZE, PAGESIZE);
683 658 bzero(gdt0, PAGESIZE);
684 659
685 660 init_gdt_common(gdt0);
686 661
687 662 /*
688 663 * Copy in from boot's gdt to our gdt.
689 664 * Entry 0 is the null descriptor by definition.
690 665 */
691 666 rd_gdtr(&r_bgdt);
692 667 bgdt = (user_desc_t *)r_bgdt.dtr_base;
693 668 if (bgdt == NULL)
694 669 panic("null boot gdt");
695 670
696 671 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
697 672 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
698 673 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
699 674 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
700 675 gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];
701 676
702 677 /*
703 678 * Install our new GDT
704 679 */
705 680 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
706 681 r_gdt.dtr_base = (uintptr_t)gdt0;
707 682 wr_gdtr(&r_gdt);
708 683
709 684 /*
710 685 * Reload the segment registers to use the new GDT
711 686 */
712 687 load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
713 688
714 689 /*
715 690 * setup %gs for kernel
716 691 */
717 692 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
718 693
719 694 /*
720 695 * XX64 We should never dereference off "other gsbase" or
721 696 * "fsbase". So, we should arrange to point FSBASE and
722 697 * KGSBASE somewhere truly awful e.g. point it at the last
723 698 * valid address below the hole so that any attempts to index
724 699 * off them cause an exception.
725 700 *
726 701 * For now, point it at 8G -- at least it should be unmapped
727 702 * until some 64-bit processes run.
728 703 */
729 704 wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
730 705 wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
731 706 return (gdt0);
732 707 }
733 708
734 709 #endif /* __xpv */
735 710
736 711 #elif defined(__i386)
737 712
738 713 static void
739 714 init_gdt_common(user_desc_t *gdt)
740 715 {
741 716 int i;
742 717
743 718 /*
744 719 * Text and data for both kernel and user span entire 32 bit
745 720 * address space.
746 721 */
747 722
748 723 /*
749 724 * kernel code segment.
750 725 */
751 726 set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
752 727 SDP_OP32);
753 728
754 729 /*
755 730 * kernel data segment.
756 731 */
757 732 set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
758 733 SDP_OP32);
759 734
760 735 /*
761 736 * user code segment.
762 737 */
763 738 set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
764 739 SDP_OP32);
765 740
766 741 /*
767 742 * user data segment.
768 743 */
769 744 set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
770 745 SDP_OP32);
771 746
772 747 #if !defined(__xpv)
773 748
774 749 /*
775 750 * TSS for T_DBLFLT (double fault) handler
776 751 */
777 752 set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
778 753 sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);
779 754
780 755 /*
781 756 * TSS for kernel
782 757 */
783 758 set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
784 759 sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
785 760
786 761 #endif /* !__xpv */
787 762
788 763 /*
789 764 * %gs selector for kernel
790 765 */
791 766 set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) -1, SDT_MEMRWA,
792 767 SEL_KPL, SDP_BYTES, SDP_OP32);
793 768
794 769 /*
795 770 * Initialize lwp private descriptors.
796 771 * Only attributes and limits are initialized, the effective
797 772 * base address is programmed via fsbase/gsbase.
798 773 */
799 774 set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
800 775 SDP_PAGES, SDP_OP32);
801 776 set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
802 777 SDP_PAGES, SDP_OP32);
803 778
804 779 /*
805 780 * Initialize the descriptors set aside for brand usage.
806 781 * Only attributes and limits are initialized.
807 782 */
808 783 for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
809 784 set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
810 785 SDP_PAGES, SDP_OP32);
811 786 /*
812 787 * Initialize convenient zero base user descriptor for clearing
813 788 * lwp private %fs and %gs descriptors in GDT. See setregs() for
814 789 * an example.
815 790 */
816 791 set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
817 792 SDP_BYTES, SDP_OP32);
818 793 }
819 794
820 795 #if defined(__xpv)
821 796
822 797 static user_desc_t *
823 798 init_gdt(void)
824 799 {
825 800 uint64_t gdtpa;
826 801 ulong_t ma[1]; /* XXPV should be a memory_t */
827 802
828 803 #if !defined(__lint)
829 804 /*
830 805 * Our gdt is never larger than a single page.
831 806 */
832 807 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
833 808 #endif
834 809 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
835 810 PAGESIZE, PAGESIZE);
836 811 bzero(gdt0, PAGESIZE);
837 812
838 813 init_gdt_common(gdt0);
839 814 gdtpa = pfn_to_pa(va_to_pfn(gdt0));
840 815
841 816 /*
842 817 * XXX Since we never invoke kmdb until after the kernel takes
843 818 * over the descriptor tables why not have it use the kernel's
844 819 * selectors?
845 820 */
846 821 if (boothowto & RB_DEBUG) {
847 822 set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
848 823 SDP_PAGES, SDP_OP32);
849 824 set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
850 825 SDP_PAGES, SDP_OP32);
851 826 }
852 827
853 828 /*
854 829 * Clear write permission for page containing the gdt and install it.
855 830 */
856 831 ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
857 832 kbm_read_only((uintptr_t)gdt0, gdtpa);
858 833 xen_set_gdt(ma, NGDT);
859 834
860 835 /*
861 836 * Reload the segment registers to use the new GDT
862 837 */
863 838 load_segment_registers(
864 839 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
865 840
866 841 return (gdt0);
867 842 }
868 843
869 844 #else /* __xpv */
870 845
871 846 static user_desc_t *
872 847 init_gdt(void)
873 848 {
874 849 desctbr_t r_bgdt, r_gdt;
875 850 user_desc_t *bgdt;
876 851
877 852 #if !defined(__lint)
878 853 /*
879 854 * Our gdt is never larger than a single page.
880 855 */
881 856 ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
882 857 #endif
883 858 /*
884 859 * XXX this allocation belongs in our caller, not here.
885 860 */
886 861 gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
887 862 PAGESIZE, PAGESIZE);
888 863 bzero(gdt0, PAGESIZE);
889 864
890 865 init_gdt_common(gdt0);
891 866
892 867 /*
893 868 * Copy in from boot's gdt to our gdt entries.
894 869 * Entry 0 is null descriptor by definition.
895 870 */
896 871 rd_gdtr(&r_bgdt);
897 872 bgdt = (user_desc_t *)r_bgdt.dtr_base;
898 873 if (bgdt == NULL)
899 874 panic("null boot gdt");
900 875
901 876 gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
902 877 gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
903 878 gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
904 879 gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
905 880
906 881 /*
907 882 * Install our new GDT
908 883 */
909 884 r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
910 885 r_gdt.dtr_base = (uintptr_t)gdt0;
911 886 wr_gdtr(&r_gdt);
912 887
913 888 /*
914 889 * Reload the segment registers to use the new GDT
915 890 */
916 891 load_segment_registers(
917 892 KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
918 893
919 894 return (gdt0);
920 895 }
921 896
922 897 #endif /* __xpv */
923 898 #endif /* __i386 */
924 899
925 900 /*
926 901 * Build kernel IDT.
927 902 *
928 903 * Note that for amd64 we pretty much require every gate to be an interrupt
929 904 * gate which blocks interrupts atomically on entry; that's because of our
930 905 * dependency on using 'swapgs' every time we come into the kernel to find
931 906 * the cpu structure. If we get interrupted just before doing that, %cs could
932 907 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
933 908 * %gsbase is really still pointing at something in userland. Bad things will
934 909 * ensue. We also use interrupt gates for i386 as well even though this is not
935 910 * required for some traps.
936 911 *
937 912 * Perhaps they should have invented a trap gate that does an atomic swapgs?
938 913 */
939 914 static void
940 915 init_idt_common(gate_desc_t *idt)
941 916 {
942 917 set_gatesegd(&idt[T_ZERODIV],
943 918 (kpti_enable == 1) ? &tr_div0trap : &div0trap,
944 919 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
945 920 set_gatesegd(&idt[T_SGLSTP],
946 921 (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
947 922 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
948 923 set_gatesegd(&idt[T_NMIFLT],
949 924 (kpti_enable == 1) ? &tr_nmiint : &nmiint,
950 925 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
951 926 set_gatesegd(&idt[T_BPTFLT],
952 927 (kpti_enable == 1) ? &tr_brktrap : &brktrap,
953 928 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
954 929 set_gatesegd(&idt[T_OVFLW],
955 930 (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
956 931 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
957 932 set_gatesegd(&idt[T_BOUNDFLT],
958 933 (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
959 934 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
960 935 set_gatesegd(&idt[T_ILLINST],
961 936 (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
962 937 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
963 938 set_gatesegd(&idt[T_NOEXTFLT],
964 939 (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
965 940 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
966 941
967 942 /*
968 943 * double fault handler.
969 944 *
970 945 * Note that on the hypervisor a guest does not receive #df faults.
971 946 * Instead a failsafe event is injected into the guest if its selectors
972 947 * and/or stack is in a broken state. See xen_failsafe_callback.
973 948 */
974 949 #if !defined(__xpv)
975 950 set_gatesegd(&idt[T_DBLFLT],
976 951 (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
977 952 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
978 953 #endif /* !__xpv */
979 954
980 955 /*
981 956 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
982 957 */
983 958 set_gatesegd(&idt[T_TSSFLT],
984 959 (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
985 960 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
986 961 set_gatesegd(&idt[T_SEGFLT],
987 962 (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
988 963 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
989 964 set_gatesegd(&idt[T_STKFLT],
990 965 (kpti_enable == 1) ? &tr_stktrap : &stktrap,
991 966 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
992 967 set_gatesegd(&idt[T_GPFLT],
993 968 (kpti_enable == 1) ? &tr_gptrap : &gptrap,
994 969 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
995 970 set_gatesegd(&idt[T_PGFLT],
996 971 (kpti_enable == 1) ? &tr_pftrap : &pftrap,
997 972 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
998 973 set_gatesegd(&idt[T_EXTERRFLT],
999 974 (kpti_enable == 1) ? &tr_ndperr : &ndperr,
1000 975 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
1001 976 set_gatesegd(&idt[T_ALIGNMENT],
1002 977 (kpti_enable == 1) ? &tr_achktrap : &achktrap,
1003 978 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
1004 979 set_gatesegd(&idt[T_MCE],
1005 980 (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
1006 981 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
1007 982 set_gatesegd(&idt[T_SIMDFPE],
1008 983 (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
1009 984 KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
1010 985
1011 986 /*
1012 987 * install fast trap handler at 210.
1013 988 */
1014 989 set_gatesegd(&idt[T_FASTTRAP],
1015 990 (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
1016 991 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
1017 992
1018 993 /*
1019 994 * System call handler.
1020 995 */
1021 996 set_gatesegd(&idt[T_SYSCALLINT],
1022 997 (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
1023 998 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
1024 999
1025 1000 /*
1026 1001 * Install the DTrace interrupt handler for the pid provider.
1027 1002 */
1028 1003 set_gatesegd(&idt[T_DTRACE_RET],
1029 1004 (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
1030 1005 KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
1031 1006
1032 1007 /*
1033 1008 * Prepare interposing descriptor for the syscall handler
1034 1009 * and cache copy of the default descriptor.
1035 1010 */
1036 1011 brand_tbl[0].ih_inum = T_SYSCALLINT;
1037 1012 brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
1038 1013
1039 1014 set_gatesegd(&(brand_tbl[0].ih_interp_desc),
1040 1015 (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
1041 1016 &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
1042 1017 idt_vector_to_ist(T_SYSCALLINT));
1043 1018
1044 1019 brand_tbl[1].ih_inum = 0;
1045 1020 }
1046 1021
1047 1022 #if defined(__xpv)
1048 1023
1049 1024 static void
1050 1025 init_idt(gate_desc_t *idt)
1051 1026 {
1052 1027 init_idt_common(idt);
1053 1028 }
1054 1029
1055 1030 #else /* __xpv */
1056 1031
1057 1032 static void
1058 1033 init_idt(gate_desc_t *idt)
1059 1034 {
1060 1035 char ivctname[80];
1061 1036 void (*ivctptr)(void);
1062 1037 int i;
1063 1038
1064 1039 /*
1065 1040 * Initialize entire table with 'reserved' trap and then overwrite
1066 1041 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
1067 1042 * since it can only be generated on a 386 processor. 15 is also
1068 1043 * unsupported and reserved.
1069 1044 */
1070 1045 #if !defined(__xpv)
1071 1046 for (i = 0; i < NIDT; i++) {
1072 1047 set_gatesegd(&idt[i],
1073 1048 (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
1074 1049 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1075 1050 idt_vector_to_ist(T_RESVTRAP));
1076 1051 }
1077 1052 #else
1078 1053 for (i = 0; i < NIDT; i++) {
1079 1054 set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1080 1055 IST_NONE);
1081 1056 }
1082 1057 #endif
1083 1058
1084 1059 /*
1085 1060 * 20-31 reserved
1086 1061 */
1087 1062 #if !defined(__xpv)
1088 1063 for (i = 20; i < 32; i++) {
1089 1064 set_gatesegd(&idt[i],
1090 1065 (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
1091 1066 KCS_SEL, SDT_SYSIGT, TRP_KPL,
1092 1067 idt_vector_to_ist(T_INVALTRAP));
1093 1068 }
1094 1069 #else
1095 1070 for (i = 20; i < 32; i++) {
1096 1071 set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1097 1072 IST_NONE);
1098 1073 }
1099 1074 #endif
1100 1075
1101 1076 /*
1102 1077 * interrupts 32 - 255
1103 1078 */
1104 1079 for (i = 32; i < 256; i++) {
1105 1080 #if !defined(__xpv)
1106 1081 (void) snprintf(ivctname, sizeof (ivctname),
1107 1082 (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
1108 1083 #else
1109 1084 (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
1110 1085 #endif
1111 1086 ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
1112 1087 if (ivctptr == NULL)
1113 1088 panic("kobj_getsymvalue(%s) failed", ivctname);
1114 1089
1115 1090 set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
1116 1091 idt_vector_to_ist(i));
1117 1092 }
1118 1093
1119 1094 /*
1120 1095 * Now install the common ones. Note that it will overlay some
1121 1096 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
1122 1097 */
1123 1098 init_idt_common(idt);
1124 1099 }
1125 1100
1126 1101 #endif /* __xpv */
1127 1102
/*
 * Start out with no LDT at all.
 *
 * The kernel only gets involved with LDTs once a user explicitly creates
 * one, so in the normal case the LDTR holds 0 and any attempt by a process
 * to reference the LDT raises #gp.  System calls made through the obsolete
 * lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if !defined(__xpv)
	wr_ldtr(0);
#else
	xen_set_ldt(NULL, 0);
#endif
}
1143 1118
1144 1119 #if !defined(__xpv)
1145 1120
1146 1121 static void
1147 1122 init_tss(void)
1148 1123 {
1149 1124 extern struct cpu cpus[];
1150 1125
1151 1126 /*
1152 1127 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
1153 1128 * context switch but it'll be overwritten with this same value anyway.
1154 1129 */
1155 1130 if (kpti_enable == 1) {
1156 1131 ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1157 1132 }
1158 1133
1159 1134 /* Set up the IST stacks for double fault, NMI, MCE. */
1160 1135 ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
1161 1136 ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
1162 1137 ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
1163 1138
1164 1139 /*
1165 1140 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
1166 1141 * enabled), and also for KDI (always).
1167 1142 */
1168 1143 ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
1169 1144
1170 1145 if (kpti_enable == 1) {
1171 1146 /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
1172 1147 ktss0->tss_ist5 =
1173 1148 (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
1174 1149
1175 1150 /* This IST stack is used for all other intrs (for KPTI). */
1176 1151 ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
1177 1152 }
1178 1153
1179 1154 /*
1180 1155 * Set I/O bit map offset equal to size of TSS segment limit
1181 1156 * for no I/O permission map. This will force all user I/O
1182 1157 * instructions to generate #gp fault.
1183 1158 */
1184 1159 ktss0->tss_bitmapbase = sizeof (*ktss0);
1185 1160
1186 1161 /*
1187 1162 * Point %tr to descriptor for ktss0 in gdt.
1188 1163 */
1189 1164 wr_tsr(KTSS_SEL);
1190 1165 }
1191 1166
1192 1167 #endif /* !__xpv */
1193 1168
1194 1169 #if defined(__xpv)
1195 1170
1196 1171 void
1197 1172 init_desctbls(void)
1198 1173 {
1199 1174 uint_t vec;
1200 1175 user_desc_t *gdt;
1201 1176
1202 1177 /*
1203 1178 * Setup and install our GDT.
1204 1179 */
1205 1180 gdt = init_gdt();
1206 1181
1207 1182 /*
1208 1183 * Store static pa of gdt to speed up pa_to_ma() translations
1209 1184 * on lwp context switches.
1210 1185 */
1211 1186 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1212 1187 CPU->cpu_gdt = gdt;
1213 1188 CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));
1214 1189
1215 1190 /*
1216 1191 * Setup and install our IDT.
1217 1192 */
1218 1193 #if !defined(__lint)
1219 1194 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1220 1195 #endif
1221 1196 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1222 1197 PAGESIZE, PAGESIZE);
1223 1198 bzero(idt0, PAGESIZE);
1224 1199 init_idt(idt0);
1225 1200 for (vec = 0; vec < NIDT; vec++)
1226 1201 xen_idt_write(&idt0[vec], vec);
1227 1202
1228 1203 CPU->cpu_idt = idt0;
1229 1204
1230 1205 /*
1231 1206 * set default kernel stack
1232 1207 */
1233 1208 xen_stack_switch(KDS_SEL,
1234 1209 (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);
1235 1210
1236 1211 xen_init_callbacks();
1237 1212
1238 1213 init_ldt();
1239 1214 }
1240 1215
1241 1216 #else /* __xpv */
1242 1217
1243 1218 void
1244 1219 init_desctbls(void)
1245 1220 {
1246 1221 user_desc_t *gdt;
1247 1222 desctbr_t idtr;
1248 1223
1249 1224 /*
1250 1225 * Allocate IDT and TSS structures on unique pages for better
1251 1226 * performance in virtual machines.
1252 1227 */
1253 1228 #if !defined(__lint)
1254 1229 ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
1255 1230 #endif
1256 1231 idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
1257 1232 PAGESIZE, PAGESIZE);
1258 1233 bzero(idt0, PAGESIZE);
1259 1234 #if !defined(__lint)
1260 1235 ASSERT(sizeof (*ktss0) <= PAGESIZE);
1261 1236 #endif
1262 1237 ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
1263 1238 PAGESIZE, PAGESIZE);
1264 1239 bzero(ktss0, PAGESIZE);
1265 1240
1266 1241 #if defined(__i386)
1267 1242 #if !defined(__lint)
1268 1243 ASSERT(sizeof (*dftss0) <= PAGESIZE);
1269 1244 #endif
1270 1245 dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
1271 1246 PAGESIZE, PAGESIZE);
1272 1247 bzero(dftss0, PAGESIZE);
1273 1248 #endif
1274 1249
1275 1250 /*
1276 1251 * Setup and install our GDT.
1277 1252 */
1278 1253 gdt = init_gdt();
1279 1254 ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
1280 1255 CPU->cpu_gdt = gdt;
1281 1256
1282 1257 /*
1283 1258 * Initialize this CPU's LDT.
1284 1259 */
1285 1260 CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
1286 1261 LDT_CPU_SIZE, PAGESIZE);
1287 1262 bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
1288 1263 CPU->cpu_m.mcpu_ldt_len = 0;
1289 1264
1290 1265 /*
1291 1266 * Setup and install our IDT.
1292 1267 */
1293 1268 init_idt(idt0);
1294 1269
1295 1270 idtr.dtr_base = (uintptr_t)idt0;
1296 1271 idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
1297 1272 wr_idtr(&idtr);
1298 1273 CPU->cpu_idt = idt0;
1299 1274
1300 1275 #if defined(__i386)
1301 1276 /*
1302 1277 * We maintain a description of idt0 in convenient IDTR format
1303 1278 * for #pf's on some older pentium processors. See pentium_pftrap().
1304 1279 */
1305 1280 idt0_default_r = idtr;
1306 1281 #endif /* __i386 */
1307 1282
1308 1283 init_tss();
1309 1284 CPU->cpu_tss = ktss0;
1310 1285 init_ldt();
1311 1286
1312 1287 /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
1313 1288 kpti_safe_cr3 = (uint64_t)getcr3();
1314 1289 }
1315 1290
1316 1291 #endif /* __xpv */
1317 1292
1318 1293 /*
1319 1294 * In the early kernel, we need to set up a simple GDT to run on.
1320 1295 *
1321 1296 * XXPV Can dboot use this too? See dboot_gdt.s
1322 1297 */
1323 1298 void
1324 1299 init_boot_gdt(user_desc_t *bgdt)
1325 1300 {
1326 1301 #if defined(__amd64)
1327 1302 set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
1328 1303 SDP_PAGES, SDP_OP32);
1329 1304 set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
1330 1305 SDP_PAGES, SDP_OP32);
1331 1306 #elif defined(__i386)
1332 1307 set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
1333 1308 SDP_PAGES, SDP_OP32);
1334 1309 set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
1335 1310 SDP_PAGES, SDP_OP32);
1336 1311 #endif /* __i386 */
1337 1312 }
1338 1313
1339 1314 /*
1340 1315 * Enable interpositioning on the system call path by rewriting the
1341 1316 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1342 1317 * the branded entry points.
1343 1318 */
1344 1319 void
1345 1320 brand_interpositioning_enable(void)
1346 1321 {
1347 1322 gate_desc_t *idt = CPU->cpu_idt;
1348 1323 int i;
1349 1324
1350 1325 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1351 1326
1352 1327 for (i = 0; brand_tbl[i].ih_inum; i++) {
1353 1328 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
1354 1329 #if defined(__xpv)
1355 1330 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1356 1331 brand_tbl[i].ih_inum);
1357 1332 #endif
1358 1333 }
1359 1334
1360 1335 #if defined(__amd64)
1361 1336 #if defined(__xpv)
1362 1337
1363 1338 /*
1364 1339 * Currently the hypervisor only supports 64-bit syscalls via
1365 1340 * syscall instruction. The 32-bit syscalls are handled by
1366 1341 * interrupt gate above.
1367 1342 */
1368 1343 xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
1369 1344 CALLBACKF_mask_events);
1370 1345
1371 1346 #else
1372 1347
1373 1348 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1374 1349 if (kpti_enable == 1) {
1375 1350 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
1376 1351 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
1377 1352 } else {
1378 1353 wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
1379 1354 wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
1380 1355 }
1381 1356 }
1382 1357
1383 1358 #endif
1384 1359 #endif /* __amd64 */
1385 1360
1386 1361 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1387 1362 if (kpti_enable == 1) {
1388 1363 wrmsr(MSR_INTC_SEP_EIP,
1389 1364 (uintptr_t)tr_brand_sys_sysenter);
1390 1365 } else {
1391 1366 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
1392 1367 }
1393 1368 }
1394 1369 }
1395 1370
1396 1371 /*
1397 1372 * Disable interpositioning on the system call path by rewriting the
1398 1373 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
1399 1374 * the standard entry points, which bypass the interpositioning hooks.
1400 1375 */
1401 1376 void
1402 1377 brand_interpositioning_disable(void)
1403 1378 {
1404 1379 gate_desc_t *idt = CPU->cpu_idt;
1405 1380 int i;
1406 1381
1407 1382 ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
1408 1383
1409 1384 for (i = 0; brand_tbl[i].ih_inum; i++) {
1410 1385 idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
1411 1386 #if defined(__xpv)
1412 1387 xen_idt_write(&idt[brand_tbl[i].ih_inum],
1413 1388 brand_tbl[i].ih_inum);
1414 1389 #endif
1415 1390 }
1416 1391
1417 1392 #if defined(__amd64)
1418 1393 #if defined(__xpv)
1419 1394
1420 1395 /*
1421 1396 * See comment above in brand_interpositioning_enable.
1422 1397 */
1423 1398 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
1424 1399 CALLBACKF_mask_events);
1425 1400
1426 1401 #else
1427 1402
1428 1403 if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
1429 1404 if (kpti_enable == 1) {
1430 1405 wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
1431 1406 wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
1432 1407 } else {
1433 1408 wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
1434 1409 wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
1435 1410 }
1436 1411 }
1437 1412
1438 1413 #endif
1439 1414 #endif /* __amd64 */
1440 1415
1441 1416 if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
1442 1417 if (kpti_enable == 1) {
1443 1418 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
1444 1419 } else {
1445 1420 wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
1446 1421 }
1447 1422 }
1448 1423 }
↓ open down ↓ |
1008 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX