9600 LDT still not happy under KPTI
--- old/usr/src/uts/intel/ia32/os/sysi86.c
+++ new/usr/src/uts/intel/ia32/os/sysi86.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2018 Joyent, Inc.
24 24 */
25 25
26 26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /* Copyright (c) 1987, 1988 Microsoft Corporation */
31 31 /* All Rights Reserved */
32 32
33 33 #include <sys/param.h>
34 34 #include <sys/types.h>
35 35 #include <sys/sysmacros.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/signal.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/fault.h>
40 40 #include <sys/syscall.h>
41 41 #include <sys/cpuvar.h>
42 42 #include <sys/sysi86.h>
43 43 #include <sys/psw.h>
44 44 #include <sys/cred.h>
45 45 #include <sys/policy.h>
46 46 #include <sys/thread.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/ontrap.h>
49 49 #include <sys/privregs.h>
50 50 #include <sys/x86_archext.h>
51 51 #include <sys/vmem.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/mman.h>
54 54 #include <sys/archsystm.h>
55 55 #include <vm/hat.h>
56 56 #include <vm/as.h>
57 57 #include <vm/seg.h>
58 58 #include <vm/seg_kmem.h>
59 59 #include <vm/faultcode.h>
60 60 #include <sys/fp.h>
61 61 #include <sys/cmn_err.h>
62 62 #include <sys/segments.h>
63 63 #include <sys/clock.h>
64 64 #include <vm/hat_i86.h>
65 65 #if defined(__xpv)
66 66 #include <sys/hypervisor.h>
67 67 #include <sys/note.h>
68 68 #endif
69 69
70 70 static void ldt_alloc(proc_t *, uint_t);
71 71 static void ldt_free(proc_t *);
72 72 static void ldt_dup(proc_t *, proc_t *);
73 73 static void ldt_grow(proc_t *, uint_t);
74 74
75 75 /*
76 76 * sysi86 System Call
77 77 */
78 78
79 79 /* ARGSUSED */
80 80 int
81 81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
82 82 {
83 83 struct ssd ssd;
84 84 int error = 0;
85 85 int c;
86 86 proc_t *pp = curproc;
87 87
88 88 switch (cmd) {
89 89
90 90 /*
91 91 * The SI86V86 subsystem call of the SYSI86 system call
92 92 * supports only one subcode -- V86SC_IOPL.
93 93 */
94 94 case SI86V86:
95 95 if (arg1 == V86SC_IOPL) {
96 96 struct regs *rp = lwptoregs(ttolwp(curthread));
97 97 greg_t oldpl = rp->r_ps & PS_IOPL;
98 98 greg_t newpl = arg2 & PS_IOPL;
99 99
100 100 /*
101 101 * Must be privileged to run this system call
102 102 * if giving more io privilege.
103 103 */
104 104 if (newpl > oldpl && (error =
105 105 secpolicy_sys_config(CRED(), B_FALSE)) != 0)
106 106 return (set_errno(error));
107 107 #if defined(__xpv)
108 108 kpreempt_disable();
109 109 installctx(curthread, NULL, xen_disable_user_iopl,
110 110 xen_enable_user_iopl, NULL, NULL,
111 111 xen_disable_user_iopl, NULL);
112 112 xen_enable_user_iopl();
113 113 kpreempt_enable();
114 114 #else
115 115 rp->r_ps ^= oldpl ^ newpl;
116 116 #endif
117 117 } else
118 118 error = EINVAL;
119 119 break;
120 120
121 121 /*
122 122 * Set a segment descriptor
123 123 */
124 124 case SI86DSCR:
125 125 /*
126 126 * There are considerable problems here manipulating
127 127 * resources shared by many running lwps. Get everyone
128 128 * into a safe state before changing the LDT.
129 129 */
130 130 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
131 131 error = EINTR;
132 132 break;
133 133 }
134 134
135 135 if (get_udatamodel() == DATAMODEL_LP64) {
136 136 error = EINVAL;
137 137 break;
138 138 }
139 139
140 140 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
141 141 error = EFAULT;
142 142 break;
143 143 }
144 144
145 145 error = setdscr(&ssd);
146 146
147 147 mutex_enter(&pp->p_lock);
148 148 if (curthread != pp->p_agenttp)
149 149 continuelwps(pp);
150 150 mutex_exit(&pp->p_lock);
151 151 break;
152 152
153 153 case SI86FPHW:
154 154 c = fp_kind & 0xff;
155 155 if (suword32((void *)arg1, c) == -1)
156 156 error = EFAULT;
157 157 break;
158 158
159 159 case SI86FPSTART:
160 160 /*
161 161 * arg1 is the address of _fp_hw
162 162 * arg2 is the desired x87 FCW value
163 163 * arg3 is the desired SSE MXCSR value
164 164 * a return value of one means SSE hardware, else none.
165 165 */
166 166 c = fp_kind & 0xff;
167 167 if (suword32((void *)arg1, c) == -1) {
168 168 error = EFAULT;
169 169 break;
170 170 }
171 171 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
172 172 return ((fp_kind & __FP_SSE) ? 1 : 0);
173 173
174 174 /* real time clock management commands */
175 175
176 176 case WTODC:
177 177 if ((error = secpolicy_settime(CRED())) == 0) {
178 178 timestruc_t ts;
179 179 mutex_enter(&tod_lock);
180 180 gethrestime(&ts);
181 181 tod_set(ts);
182 182 mutex_exit(&tod_lock);
183 183 }
184 184 break;
185 185
186 186 /* Give some timezone playing room */
187 187 #define ONEWEEK (7 * 24 * 60 * 60)
188 188
189 189 case SGMTL:
190 190 /*
191 191 * Called from 32 bit land, negative values
192 192 * are not sign extended, so we do that here
193 193 * by casting it to an int and back. We also
194 194 * clamp the value to within reason and detect
195 195 * when a 64 bit call overflows an int.
196 196 */
197 197 if ((error = secpolicy_settime(CRED())) == 0) {
198 198 int newlag = (int)arg1;
199 199
200 200 #ifdef _SYSCALL32_IMPL
201 201 if (get_udatamodel() == DATAMODEL_NATIVE &&
202 202 (long)newlag != (long)arg1) {
203 203 error = EOVERFLOW;
204 204 } else
205 205 #endif
206 206 if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
207 207 sgmtl(newlag);
208 208 else
209 209 error = EOVERFLOW;
210 210 }
211 211 break;
212 212
213 213 case GGMTL:
214 214 if (get_udatamodel() == DATAMODEL_NATIVE) {
215 215 if (sulword((void *)arg1, ggmtl()) == -1)
216 216 error = EFAULT;
217 217 #ifdef _SYSCALL32_IMPL
218 218 } else {
219 219 time_t gmtl;
220 220
221 221 if ((gmtl = ggmtl()) > INT32_MAX) {
222 222 /*
223 223 * Since gmt_lag can at most be
224 224 * +/- 12 hours, something is
225 225 * *seriously* messed up here.
226 226 */
227 227 error = EOVERFLOW;
228 228 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
229 229 error = EFAULT;
230 230 #endif
231 231 }
232 232 break;
233 233
234 234 case RTCSYNC:
235 235 if ((error = secpolicy_settime(CRED())) == 0)
236 236 rtcsync();
237 237 break;
238 238
239 239 /* END OF real time clock management commands */
240 240
241 241 default:
242 242 error = EINVAL;
243 243 break;
244 244 }
245 245 return (error == 0 ? 0 : set_errno(error));
246 246 }
247 247
248 248 void
249 249 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
250 250 {
251 251 ssd->bo = USEGD_GETBASE(usd);
252 252 ssd->ls = USEGD_GETLIMIT(usd);
253 253 ssd->sel = sel;
254 254
255 255 /*
256 256 * set type, dpl and present bits.
257 257 */
258 258 ssd->acc1 = usd->usd_type;
259 259 ssd->acc1 |= usd->usd_dpl << 5;
260 260 ssd->acc1 |= usd->usd_p << (5 + 2);
261 261
262 262 /*
263 263 * set avl, DB and granularity bits.
264 264 */
265 265 ssd->acc2 = usd->usd_avl;
266 266
267 267 #if defined(__amd64)
268 268 ssd->acc2 |= usd->usd_long << 1;
269 269 #else
270 270 ssd->acc2 |= usd->usd_reserved << 1;
271 271 #endif
272 272
273 273 ssd->acc2 |= usd->usd_def32 << (1 + 1);
274 274 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
275 275 }
276 276
277 277 static void
278 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
279 279 {
280 280
281 281 ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
282 282
283 283 USEGD_SETBASE(usd, ssd->bo);
284 284 USEGD_SETLIMIT(usd, ssd->ls);
285 285
286 286 /*
287 - * set type, dpl and present bits.
287 + * Set type, dpl and present bits.
288 + *
289 + * Force the "accessed" bit to on so that we don't run afoul of
290 + * KPTI.
288 291 */
289 - usd->usd_type = ssd->acc1;
292 + usd->usd_type = ssd->acc1 | SDT_A;
290 293 usd->usd_dpl = ssd->acc1 >> 5;
291 294 usd->usd_p = ssd->acc1 >> (5 + 2);
292 295
293 296 ASSERT(usd->usd_type >= SDT_MEMRO);
294 297 ASSERT(usd->usd_dpl == SEL_UPL);
295 298
296 299 /*
297 300 * 64-bit code selectors are never allowed in the LDT.
298 301 * Reserved bit is always 0 on 32-bit systems.
299 302 */
300 303 #if defined(__amd64)
301 304 usd->usd_long = 0;
302 305 #else
303 306 usd->usd_reserved = 0;
304 307 #endif
305 308
306 309 /*
307 310 * set avl, DB and granularity bits.
308 311 */
309 312 usd->usd_avl = ssd->acc2;
310 313 usd->usd_def32 = ssd->acc2 >> (1 + 1);
311 314 usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
312 315 }
313 316
314 317
315 318 #if defined(__i386)
316 319
317 320 static void
318 321 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
319 322 {
320 323
321 324 ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
322 325
323 326 sgd->sgd_looffset = ssd->bo;
324 327 sgd->sgd_hioffset = ssd->bo >> 16;
325 328
326 329 sgd->sgd_selector = ssd->ls;
327 330
328 331 /*
329 332 * set type, dpl and present bits.
330 333 */
331 334 sgd->sgd_type = ssd->acc1;
332 335 sgd->sgd_dpl = ssd->acc1 >> 5;
333 336 sgd->sgd_p = ssd->acc1 >> 7;
334 337 ASSERT(sgd->sgd_type == SDT_SYSCGT);
335 338 ASSERT(sgd->sgd_dpl == SEL_UPL);
336 339 sgd->sgd_stkcpy = 0;
337 340 }
338 341
339 342 #endif /* __i386 */
340 343
341 344 /*
342 345 * Load LDT register with the current process's LDT.
343 346 */
344 347 static void
345 348 ldt_load(void)
346 349 {
347 350 #if defined(__xpv)
348 - xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
349 - curproc->p_ldtlimit + 1);
351 + xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
350 352 #else
351 353 size_t len;
352 354 system_desc_t desc;
353 355
354 356 /*
355 357 * Before we can use the LDT on this CPU, we must install the LDT in the
356 358 * user mapping table.
357 359 */
358 360 len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
359 361 bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
360 362 CPU->cpu_m.mcpu_ldt_len = len;
361 363 set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
362 364 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
363 365
364 366 wr_ldtr(ULDT_SEL);
365 367 #endif
366 368 }
367 369
368 370 /*
369 371 * Store a NULL selector in the LDTR. All subsequent illegal references to
370 372 * the LDT will result in a #gp.
371 373 */
372 374 void
373 375 ldt_unload(void)
374 376 {
375 377 #if defined(__xpv)
376 378 xen_set_ldt(NULL, 0);
377 379 #else
378 380 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
379 381 wr_ldtr(0);
380 382
381 383 bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
382 384 CPU->cpu_m.mcpu_ldt_len = 0;
383 385 #endif
384 386 }
385 387
386 388 /*ARGSUSED*/
387 389 static void
388 390 ldt_savectx(proc_t *p)
389 391 {
390 392 ASSERT(p->p_ldt != NULL);
391 393 ASSERT(p == curproc);
392 394
393 395 #if defined(__amd64)
394 396 /*
395 397 * The 64-bit kernel must be sure to clear any stale ldt
396 398 * selectors when context switching away from a process that
397 399 * has a private ldt. Consider the following example:
398 400 *
399 401 * Wine creates an ldt descriptor and points a segment register
400 402 * to it.
401 403 *
402 404 * We then context switch away from wine lwp to kernel
403 405 * thread and hit breakpoint in kernel with kmdb
404 406 *
405 407 * When we continue and resume from kmdb we will #gp
406 408 * fault since kmdb will have saved the stale ldt selector
407 409 * from wine and will try to restore it but we are no longer in
408 410 * the context of the wine process and do not have our
409 411 * ldtr register pointing to the private ldt.
410 412 */
411 413 reset_sregs();
412 414 #endif
413 415
414 416 ldt_unload();
415 - cpu_fast_syscall_enable(NULL);
417 + cpu_fast_syscall_enable();
416 418 }
417 419
418 420 static void
419 421 ldt_restorectx(proc_t *p)
420 422 {
421 423 ASSERT(p->p_ldt != NULL);
422 424 ASSERT(p == curproc);
423 425
424 426 ldt_load();
425 - cpu_fast_syscall_disable(NULL);
427 + cpu_fast_syscall_disable();
426 428 }
427 429
428 430 /*
429 - * When a process with a private LDT execs, fast syscalls must be enabled for
430 - * the new process image.
431 + * At exec time, we need to clear up our LDT context and re-enable fast syscalls
432 + * for the new process image.
433 + *
434 + * The same is true for the other case, where we have:
435 + *
436 + * proc_exit()
437 + * ->exitpctx()->ldt_savectx()
438 + * ->freepctx()->ldt_freectx()
439 + *
440 + * Because pre-emption is not prevented between the two callbacks, we could have
441 + * come off CPU, and brought back LDT context when coming back on CPU via
442 + * ldt_restorectx().
431 443 */
432 444 /* ARGSUSED */
433 445 static void
434 446 ldt_freectx(proc_t *p, int isexec)
435 447 {
436 - ASSERT(p->p_ldt);
448 + ASSERT(p->p_ldt != NULL);
449 + ASSERT(p == curproc);
437 450
438 - if (isexec) {
439 - kpreempt_disable();
440 - cpu_fast_syscall_enable(NULL);
441 - kpreempt_enable();
442 - }
443 -
444 - /*
445 - * ldt_free() will free the memory used by the private LDT, reset the
446 - * process's descriptor, and re-program the LDTR.
447 - */
451 + kpreempt_disable();
448 452 ldt_free(p);
453 + cpu_fast_syscall_enable();
454 + kpreempt_enable();
449 455 }
450 456
451 457 /*
452 458 * Install ctx op that ensures syscall/sysenter are disabled.
453 459 * See comments below.
454 460 *
455 461 * When a thread with a private LDT forks, the new process
456 462 * must have the LDT context ops installed.
457 463 */
458 464 /* ARGSUSED */
459 465 static void
460 466 ldt_installctx(proc_t *p, proc_t *cp)
461 467 {
462 468 proc_t *targ = p;
463 469 kthread_t *t;
464 470
465 471 /*
466 472 * If this is a fork, operate on the child process.
467 473 */
468 474 if (cp != NULL) {
469 475 targ = cp;
470 476 ldt_dup(p, cp);
471 477 }
472 478
473 479 /*
474 480 * The process context ops expect the target process as their argument.
475 481 */
476 482 ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
477 483 ldt_installctx, ldt_savectx, ldt_freectx) == 0);
478 484
479 485 installpctx(targ, targ, ldt_savectx, ldt_restorectx,
480 486 ldt_installctx, ldt_savectx, ldt_freectx);
481 487
482 488 /*
483 489 * We've just disabled fast system call and return instructions; take
484 490 * the slow path out to make sure we don't try to use one to return
485 491 * back to user. We must set t_post_sys for every thread in the
486 492 * process to make sure none of them escape out via fast return.
487 493 */
488 494
489 495 mutex_enter(&targ->p_lock);
490 496 t = targ->p_tlist;
491 497 do {
492 498 t->t_post_sys = 1;
493 499 } while ((t = t->t_forw) != targ->p_tlist);
494 500 mutex_exit(&targ->p_lock);
495 501 }
496 502
497 503 int
498 504 setdscr(struct ssd *ssd)
499 505 {
500 - ushort_t seli; /* selector index */
506 + ushort_t seli; /* selector index */
501 507 user_desc_t *ldp; /* descriptor pointer */
502 508 user_desc_t ndesc; /* new descriptor */
503 - proc_t *pp = ttoproc(curthread);
509 + proc_t *pp = curproc;
504 510 int rc = 0;
505 511
506 512 /*
507 513 * LDT segments: executable and data at DPL 3 only.
508 514 */
509 515 if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
510 516 return (EINVAL);
511 517
512 518 /*
513 519 * check the selector index.
514 520 */
515 521 seli = SELTOIDX(ssd->sel);
516 522 if (seli >= MAXNLDT || seli < LDT_UDBASE)
517 523 return (EINVAL);
518 524
519 525 ndesc = null_udesc;
520 526 mutex_enter(&pp->p_ldtlock);
521 527
522 528 /*
523 529 * If this is the first time for this process then setup a
524 530 * private LDT for it.
525 531 */
526 532 if (pp->p_ldt == NULL) {
527 533 ldt_alloc(pp, seli);
528 534
529 535 /*
530 536 * Now that this process has a private LDT, the use of
531 537 * the syscall/sysret and sysenter/sysexit instructions
532 538 * is forbidden for this process because they destroy
533 539 * the contents of %cs and %ss segment registers.
534 540 *
535 541 * Explicitly disable them here and add a context handler
536 542 * to the process. Note that disabling
537 543 * them here means we can't use sysret or sysexit on
538 544 * the way out of this system call - so we force this
539 545 * thread to take the slow path (which doesn't make use
540 546 * of sysenter or sysexit) back out.
541 547 */
542 548 kpreempt_disable();
543 549 ldt_installctx(pp, NULL);
544 - cpu_fast_syscall_disable(NULL);
550 + cpu_fast_syscall_disable();
545 551 ASSERT(curthread->t_post_sys != 0);
546 552 kpreempt_enable();
547 553
548 554 } else if (seli > pp->p_ldtlimit) {
555 + ASSERT(pp->p_pctx != NULL);
549 556
550 557 /*
551 558 * Increase size of ldt to include seli.
552 559 */
553 560 ldt_grow(pp, seli);
554 561 }
555 562
556 563 ASSERT(seli <= pp->p_ldtlimit);
557 564 ldp = &pp->p_ldt[seli];
558 565
559 566 /*
560 567 * On the 64-bit kernel, this is where things get more subtle.
561 568 * Recall that in the 64-bit kernel, when we enter the kernel we
562 569 * deliberately -don't- reload the segment selectors we came in on
563 570 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
564 571 * and the underlying descriptors are essentially ignored by the
565 572 * hardware in long mode - except for the base that we override with
566 573 * the gsbase MSRs.
567 574 *
568 575 * However, there's one unfortunate issue with this rosy picture --
569 576 * a descriptor that's not marked as 'present' will still generate
570 577 * an #np when loading a segment register.
571 578 *
572 579 * Consider this case. An lwp creates a harmless LDT entry, points
573 580 * one of it's segment registers at it, then tells the kernel (here)
574 581 * to delete it. In the 32-bit kernel, the #np will happen on the
575 582 * way back to userland where we reload the segment registers, and be
576 583 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
577 584 * will happen in the normal case too. However, if we're trying to
578 585 * use a debugger that wants to save and restore the segment registers,
579 586 * and the debugger thinks that we have valid segment registers, we
580 587 * have the problem that the debugger will try and restore the
581 588 * segment register that points at the now 'not present' descriptor
582 589 * and will take a #np right there.
583 590 *
584 591 * We should obviously fix the debugger to be paranoid about
585 592 * -not- restoring segment registers that point to bad descriptors;
586 593 * however we can prevent the problem here if we check to see if any
587 594 * of the segment registers are still pointing at the thing we're
588 595 * destroying; if they are, return an error instead. (That also seems
589 596 * a lot better failure mode than SIGKILL and a core file
590 597 * from kern_gpfault() too.)
591 598 */
592 599 if (SI86SSD_PRES(ssd) == 0) {
593 600 kthread_t *t;
594 601 int bad = 0;
595 602
596 603 /*
597 604 * Look carefully at the segment registers of every lwp
598 605 * in the process (they're all stopped by our caller).
599 606 * If we're about to invalidate a descriptor that's still
600 607 * being referenced by *any* of them, return an error,
601 608 * rather than having them #gp on their way out of the kernel.
602 609 */
603 610 ASSERT(pp->p_lwprcnt == 1);
604 611
605 612 mutex_enter(&pp->p_lock);
606 613 t = pp->p_tlist;
607 614 do {
608 615 klwp_t *lwp = ttolwp(t);
609 616 struct regs *rp = lwp->lwp_regs;
610 617 #if defined(__amd64)
611 618 pcb_t *pcb = &lwp->lwp_pcb;
612 619 #endif
613 620
614 621 if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
615 622 bad = 1;
616 623 break;
617 624 }
618 625
619 626 #if defined(__amd64)
620 627 if (pcb->pcb_rupdate == 1) {
621 628 if (ssd->sel == pcb->pcb_ds ||
622 629 ssd->sel == pcb->pcb_es ||
623 630 ssd->sel == pcb->pcb_fs ||
624 631 ssd->sel == pcb->pcb_gs) {
625 632 bad = 1;
626 633 break;
627 634 }
628 635 } else
629 636 #endif
630 637 {
631 638 if (ssd->sel == rp->r_ds ||
632 639 ssd->sel == rp->r_es ||
633 640 ssd->sel == rp->r_fs ||
634 641 ssd->sel == rp->r_gs) {
635 642 bad = 1;
636 643 break;
637 644 }
638 645 }
639 646
640 647 } while ((t = t->t_forw) != pp->p_tlist);
641 648 mutex_exit(&pp->p_lock);
642 649
643 650 if (bad) {
644 651 mutex_exit(&pp->p_ldtlock);
645 652 return (EBUSY);
646 653 }
647 654 }
648 655
649 656 /*
650 - * If acc1 is zero, clear the descriptor (including the 'present' bit)
657 + * If acc1 is zero, clear the descriptor (including the 'present' bit).
658 + * Make sure we update the CPU-private copy of the LDT.
651 659 */
652 660 if (ssd->acc1 == 0) {
653 661 rc = ldt_update_segd(ldp, &null_udesc);
662 + kpreempt_disable();
663 + ldt_load();
664 + kpreempt_enable();
654 665 mutex_exit(&pp->p_ldtlock);
655 666 return (rc);
656 667 }
657 668
658 669 /*
659 670 * Check segment type, allow segment not present and
660 671 * only user DPL (3).
661 672 */
662 673 if (SI86SSD_DPL(ssd) != SEL_UPL) {
663 674 mutex_exit(&pp->p_ldtlock);
664 675 return (EINVAL);
665 676 }
666 677
667 -#if defined(__amd64)
668 678 /*
669 679 * Do not allow 32-bit applications to create 64-bit mode code
670 680 * segments.
671 681 */
672 682 if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
673 683 SI86SSD_ISLONG(ssd)) {
674 684 mutex_exit(&pp->p_ldtlock);
675 685 return (EINVAL);
676 686 }
677 -#endif /* __amd64 */
678 687
679 688 /*
680 - * Set up a code or data user segment descriptor.
689 + * Set up a code or data user segment descriptor, making sure to update
690 + * the CPU-private copy of the LDT.
681 691 */
682 692 if (SI86SSD_ISUSEG(ssd)) {
683 693 ssd_to_usd(ssd, &ndesc);
684 694 rc = ldt_update_segd(ldp, &ndesc);
695 + kpreempt_disable();
696 + ldt_load();
697 + kpreempt_enable();
685 698 mutex_exit(&pp->p_ldtlock);
686 699 return (rc);
687 700 }
688 701
689 -#if defined(__i386)
690 - /*
691 - * Allow a call gate only if the destination is in the LDT
692 - * and the system is running in 32-bit legacy mode.
693 - *
694 - * In long mode 32-bit call gates are redefined as 64-bit call
695 - * gates and the hw enforces that the target code selector
696 - * of the call gate must be 64-bit selector. A #gp fault is
697 - * generated if otherwise. Since we do not allow 32-bit processes
698 - * to switch themselves to 64-bits we never allow call gates
699 - * on 64-bit system system.
700 - */
701 - if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
702 -
703 -
704 - ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
705 - rc = ldt_update_segd(ldp, &ndesc);
706 - mutex_exit(&pp->p_ldtlock);
707 - return (rc);
708 - }
709 -#endif /* __i386 */
710 -
711 702 mutex_exit(&pp->p_ldtlock);
712 703 return (EINVAL);
713 704 }
714 705
715 706 /*
716 - * Allocate new LDT for process just large enough to contain seli.
717 - * Note we allocate and grow LDT in PAGESIZE chunks. We do this
718 - * to simplify the implementation and because on the hypervisor it's
719 - * required, since the LDT must live on pages that have PROT_WRITE
720 - * removed and which are given to the hypervisor.
707 + * Allocate new LDT for process just large enough to contain seli. Note we
708 + * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
709 + * implementation and because on the hypervisor it's required, since the LDT
710 + * must live on pages that have PROT_WRITE removed and which are given to the
711 + * hypervisor.
712 + *
713 + * Note that we don't actually load the LDT into the current CPU here: it's done
714 + * later by our caller.
721 715 */
722 716 static void
723 717 ldt_alloc(proc_t *pp, uint_t seli)
724 718 {
725 719 user_desc_t *ldt;
726 720 size_t ldtsz;
727 721 uint_t nsels;
728 722
729 723 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
730 724 ASSERT(pp->p_ldt == NULL);
731 725 ASSERT(pp->p_ldtlimit == 0);
732 726
733 727 /*
734 728 * Allocate new LDT just large enough to contain seli. The LDT must
735 729 * always be allocated in units of pages for KPTI.
736 730 */
737 731 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
738 732 nsels = ldtsz / sizeof (user_desc_t);
739 733 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
740 734
741 735 ldt = kmem_zalloc(ldtsz, KM_SLEEP);
742 736 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
743 737
744 738 #if defined(__xpv)
745 739 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
746 740 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
747 741 #endif
748 742
749 743 pp->p_ldt = ldt;
750 744 pp->p_ldtlimit = nsels - 1;
751 - set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
752 -
753 - if (pp == curproc) {
754 - kpreempt_disable();
755 - ldt_load();
756 - kpreempt_enable();
757 - }
758 745 }
759 746
760 747 static void
761 748 ldt_free(proc_t *pp)
762 749 {
763 750 user_desc_t *ldt;
764 751 size_t ldtsz;
765 752
766 753 ASSERT(pp->p_ldt != NULL);
767 754
768 755 mutex_enter(&pp->p_ldtlock);
769 756 ldt = pp->p_ldt;
770 757 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
771 758
772 759 ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
773 760
774 761 pp->p_ldt = NULL;
775 762 pp->p_ldtlimit = 0;
776 - pp->p_ldt_desc = null_sdesc;
777 763 mutex_exit(&pp->p_ldtlock);
778 764
779 765 if (pp == curproc) {
780 766 kpreempt_disable();
781 767 ldt_unload();
782 768 kpreempt_enable();
783 769 }
784 770
785 771 #if defined(__xpv)
786 772 /*
787 773 * We are not allowed to make the ldt writable until after
788 774 * we tell the hypervisor to unload it.
789 775 */
790 776 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
791 777 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
792 778 #endif
793 779
794 780 kmem_free(ldt, ldtsz);
795 781 }
796 782
797 783 /*
798 784 * On fork copy new ldt for child.
799 785 */
800 786 static void
801 787 ldt_dup(proc_t *pp, proc_t *cp)
802 788 {
803 789 size_t ldtsz;
804 790
805 791 ASSERT(pp->p_ldt != NULL);
806 792 ASSERT(cp != curproc);
807 793
808 794 /*
809 795 * I assume the parent's ldt can't increase since we're in a fork.
810 796 */
811 797 mutex_enter(&pp->p_ldtlock);
812 798 mutex_enter(&cp->p_ldtlock);
813 799
814 800 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
815 801
816 802 ldt_alloc(cp, pp->p_ldtlimit);
817 803
818 804 #if defined(__xpv)
819 805 /*
820 806 * Make child's ldt writable so it can be copied into from
821 807 * parent's ldt. This works since ldt_alloc above did not load
822 808 * the ldt since it's for the child process. If we tried to make
823 809 * an LDT writable that is loaded in hw the setprot operation
824 810 * would fail.
825 811 */
826 812 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
827 813 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
828 814 #endif
829 815
830 816 bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
831 817
832 818 #if defined(__xpv)
833 819 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
834 820 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
835 821 #endif
836 822 mutex_exit(&cp->p_ldtlock);
837 823 mutex_exit(&pp->p_ldtlock);
838 824
839 825 }
840 826
827 +/*
828 + * Note that we don't actually load the LDT into the current CPU here: it's done
829 + * later by our caller - unless we take an error. This works out because
830 + * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
831 + * (and therefore can't be using the freed old LDT), and by definition if the
832 + * new entry didn't pass validation, then the proc shouldn't be referencing an
833 + * entry in the extended region.
834 + */
841 835 static void
842 836 ldt_grow(proc_t *pp, uint_t seli)
843 837 {
844 838 user_desc_t *oldt, *nldt;
845 839 uint_t nsels;
846 840 size_t oldtsz, nldtsz;
847 841
848 842 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
849 843 ASSERT(pp->p_ldt != NULL);
850 844 ASSERT(pp->p_ldtlimit != 0);
851 845
852 846 /*
853 847 * Allocate larger LDT just large enough to contain seli. The LDT must
854 848 * always be allocated in units of pages for KPTI.
855 849 */
856 850 nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
857 851 nsels = nldtsz / sizeof (user_desc_t);
858 852 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
859 853 ASSERT(nsels > pp->p_ldtlimit);
860 854
861 855 oldt = pp->p_ldt;
862 856 oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
863 857
864 858 nldt = kmem_zalloc(nldtsz, KM_SLEEP);
865 859 ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
866 860
867 861 bcopy(oldt, nldt, oldtsz);
868 862
869 863 /*
870 864 * unload old ldt.
871 865 */
872 866 kpreempt_disable();
873 867 ldt_unload();
874 868 kpreempt_enable();
875 869
876 870 #if defined(__xpv)
877 871
878 872 /*
879 873 * Make old ldt writable and new ldt read only.
880 874 */
881 875 if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
882 876 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
883 877
884 878 if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
885 879 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
886 880 #endif
887 881
888 882 pp->p_ldt = nldt;
889 883 pp->p_ldtlimit = nsels - 1;
890 884
891 - /*
892 - * write new ldt segment descriptor.
893 - */
894 - set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
895 -
896 - /*
897 - * load the new ldt.
898 - */
899 - kpreempt_disable();
900 - ldt_load();
901 - kpreempt_enable();
902 -
903 885 kmem_free(oldt, oldtsz);
904 886 }
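
For context on the SI86DSCR path changed above, here is a minimal 32-bit userland sketch (not part of this change) that asks the kernel to install a read/write data descriptor in the process's private LDT. The struct ssd field encodings follow usd_to_ssd()/ssd_to_usd() in this file; the slot index, base, and limit are illustrative assumptions, the sysi86() declaration is assumed to come from <sys/sysi86.h>, and the program has to be built 32-bit since setdscr() rejects LP64 callers.

/*
 * Hedged sketch only: the slot index, base and limit below are
 * assumptions, not values taken from the change above.
 */
#include <sys/types.h>
#include <sys/sysi86.h>
#include <stdio.h>

int
main(void)
{
	struct ssd ssd;
	uint_t idx = 64;	/* example slot; must be >= LDT_UDBASE, < MAXNLDT */

	ssd.sel = (idx << 3) | 0x4 | 0x3;	/* TI=1 (LDT), RPL=3 (user) */
	ssd.bo = 0;				/* segment base */
	ssd.ls = 0xfffff;			/* limit, in pages (gran=1) */
	ssd.acc1 = 0x13 | (3 << 5) | (1 << 7);	/* RW data + accessed, DPL 3, present */
	ssd.acc2 = (1 << 2) | (1 << 3);		/* def32=1, gran=1 */

	if (sysi86(SI86DSCR, &ssd) != 0) {
		perror("sysi86(SI86DSCR)");
		return (1);
	}
	(void) printf("installed LDT selector 0x%x\n", ssd.sel);
	return (0);
}

Note that with the change above, ssd_to_usd() forces the accessed bit on regardless of what acc1 requests.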