8956 Implement KPTI Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Robert Mustacchi <rm@joyent.com>
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2018 Joyent, Inc. 24 */ 25 26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 28 /* All Rights Reserved */ 29 30 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 31 /* All Rights Reserved */ 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/sysmacros.h> 36 #include <sys/systm.h> 37 #include <sys/signal.h> 38 #include <sys/errno.h> 39 #include <sys/fault.h> 40 #include <sys/syscall.h> 41 #include <sys/cpuvar.h> 42 #include <sys/sysi86.h> 43 #include <sys/psw.h> 44 #include <sys/cred.h> 45 #include <sys/policy.h> 46 #include <sys/thread.h> 47 #include <sys/debug.h> 48 #include <sys/ontrap.h> 49 #include <sys/privregs.h> 50 #include <sys/x86_archext.h> 51 #include <sys/vmem.h> 52 #include <sys/kmem.h> 53 #include <sys/mman.h> 54 #include <sys/archsystm.h> 55 #include <vm/hat.h> 56 #include <vm/as.h> 57 #include <vm/seg.h> 58 #include <vm/seg_kmem.h> 59 #include <vm/faultcode.h> 60 #include <sys/fp.h> 61 #include <sys/cmn_err.h> 62 #include <sys/segments.h> 63 #include <sys/clock.h> 64 #include <vm/hat_i86.h> 65 #if defined(__xpv) 66 #include <sys/hypervisor.h> 67 #include <sys/note.h> 68 #endif 69 70 static void ldt_alloc(proc_t *, uint_t); 71 static void ldt_free(proc_t *); 72 static void ldt_dup(proc_t *, proc_t *); 73 static void ldt_grow(proc_t *, uint_t); 74 75 /* 76 * sysi86 System Call 77 */ 78 79 /* ARGSUSED */ 80 int 81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 82 { 83 struct ssd ssd; 84 int error = 0; 85 int c; 86 proc_t *pp = curproc; 87 88 switch (cmd) { 89 90 /* 91 * The SI86V86 subsystem call of the SYSI86 system call 92 * supports only one subcode -- V86SC_IOPL. 93 */ 94 case SI86V86: 95 if (arg1 == V86SC_IOPL) { 96 struct regs *rp = lwptoregs(ttolwp(curthread)); 97 greg_t oldpl = rp->r_ps & PS_IOPL; 98 greg_t newpl = arg2 & PS_IOPL; 99 100 /* 101 * Must be privileged to run this system call 102 * if giving more io privilege. 103 */ 104 if (newpl > oldpl && (error = 105 secpolicy_sys_config(CRED(), B_FALSE)) != 0) 106 return (set_errno(error)); 107 #if defined(__xpv) 108 kpreempt_disable(); 109 installctx(curthread, NULL, xen_disable_user_iopl, 110 xen_enable_user_iopl, NULL, NULL, 111 xen_disable_user_iopl, NULL); 112 xen_enable_user_iopl(); 113 kpreempt_enable(); 114 #else 115 rp->r_ps ^= oldpl ^ newpl; 116 #endif 117 } else 118 error = EINVAL; 119 break; 120 121 /* 122 * Set a segment descriptor 123 */ 124 case SI86DSCR: 125 /* 126 * There are considerable problems here manipulating 127 * resources shared by many running lwps. Get everyone 128 * into a safe state before changing the LDT. 129 */ 130 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) { 131 error = EINTR; 132 break; 133 } 134 135 if (get_udatamodel() == DATAMODEL_LP64) { 136 error = EINVAL; 137 break; 138 } 139 140 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) { 141 error = EFAULT; 142 break; 143 } 144 145 error = setdscr(&ssd); 146 147 mutex_enter(&pp->p_lock); 148 if (curthread != pp->p_agenttp) 149 continuelwps(pp); 150 mutex_exit(&pp->p_lock); 151 break; 152 153 case SI86FPHW: 154 c = fp_kind & 0xff; 155 if (suword32((void *)arg1, c) == -1) 156 error = EFAULT; 157 break; 158 159 case SI86FPSTART: 160 /* 161 * arg1 is the address of _fp_hw 162 * arg2 is the desired x87 FCW value 163 * arg3 is the desired SSE MXCSR value 164 * a return value of one means SSE hardware, else none. 165 */ 166 c = fp_kind & 0xff; 167 if (suword32((void *)arg1, c) == -1) { 168 error = EFAULT; 169 break; 170 } 171 fpsetcw((uint16_t)arg2, (uint32_t)arg3); 172 return ((fp_kind & __FP_SSE) ? 1 : 0); 173 174 /* real time clock management commands */ 175 176 case WTODC: 177 if ((error = secpolicy_settime(CRED())) == 0) { 178 timestruc_t ts; 179 mutex_enter(&tod_lock); 180 gethrestime(&ts); 181 tod_set(ts); 182 mutex_exit(&tod_lock); 183 } 184 break; 185 186 /* Give some timezone playing room */ 187 #define ONEWEEK (7 * 24 * 60 * 60) 188 189 case SGMTL: 190 /* 191 * Called from 32 bit land, negative values 192 * are not sign extended, so we do that here 193 * by casting it to an int and back. We also 194 * clamp the value to within reason and detect 195 * when a 64 bit call overflows an int. 196 */ 197 if ((error = secpolicy_settime(CRED())) == 0) { 198 int newlag = (int)arg1; 199 200 #ifdef _SYSCALL32_IMPL 201 if (get_udatamodel() == DATAMODEL_NATIVE && 202 (long)newlag != (long)arg1) { 203 error = EOVERFLOW; 204 } else 205 #endif 206 if (newlag >= -ONEWEEK && newlag <= ONEWEEK) 207 sgmtl(newlag); 208 else 209 error = EOVERFLOW; 210 } 211 break; 212 213 case GGMTL: 214 if (get_udatamodel() == DATAMODEL_NATIVE) { 215 if (sulword((void *)arg1, ggmtl()) == -1) 216 error = EFAULT; 217 #ifdef _SYSCALL32_IMPL 218 } else { 219 time_t gmtl; 220 221 if ((gmtl = ggmtl()) > INT32_MAX) { 222 /* 223 * Since gmt_lag can at most be 224 * +/- 12 hours, something is 225 * *seriously* messed up here. 226 */ 227 error = EOVERFLOW; 228 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1) 229 error = EFAULT; 230 #endif 231 } 232 break; 233 234 case RTCSYNC: 235 if ((error = secpolicy_settime(CRED())) == 0) 236 rtcsync(); 237 break; 238 239 /* END OF real time clock management commands */ 240 241 default: 242 error = EINVAL; 243 break; 244 } 245 return (error == 0 ? 0 : set_errno(error)); 246 } 247 248 void 249 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel) 250 { 251 ssd->bo = USEGD_GETBASE(usd); 252 ssd->ls = USEGD_GETLIMIT(usd); 253 ssd->sel = sel; 254 255 /* 256 * set type, dpl and present bits. 257 */ 258 ssd->acc1 = usd->usd_type; 259 ssd->acc1 |= usd->usd_dpl << 5; 260 ssd->acc1 |= usd->usd_p << (5 + 2); 261 262 /* 263 * set avl, DB and granularity bits. 264 */ 265 ssd->acc2 = usd->usd_avl; 266 267 #if defined(__amd64) 268 ssd->acc2 |= usd->usd_long << 1; 269 #else 270 ssd->acc2 |= usd->usd_reserved << 1; 271 #endif 272 273 ssd->acc2 |= usd->usd_def32 << (1 + 1); 274 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1); 275 } 276 277 static void 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd) 279 { 280 281 ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0); 282 283 USEGD_SETBASE(usd, ssd->bo); 284 USEGD_SETLIMIT(usd, ssd->ls); 285 286 /* 287 * set type, dpl and present bits. 288 */ 289 usd->usd_type = ssd->acc1; 290 usd->usd_dpl = ssd->acc1 >> 5; 291 usd->usd_p = ssd->acc1 >> (5 + 2); 292 293 ASSERT(usd->usd_type >= SDT_MEMRO); 294 ASSERT(usd->usd_dpl == SEL_UPL); 295 296 /* 297 * 64-bit code selectors are never allowed in the LDT. 298 * Reserved bit is always 0 on 32-bit systems. 299 */ 300 #if defined(__amd64) 301 usd->usd_long = 0; 302 #else 303 usd->usd_reserved = 0; 304 #endif 305 306 /* 307 * set avl, DB and granularity bits. 308 */ 309 usd->usd_avl = ssd->acc2; 310 usd->usd_def32 = ssd->acc2 >> (1 + 1); 311 usd->usd_gran = ssd->acc2 >> (1 + 1 + 1); 312 } 313 314 315 #if defined(__i386) 316 317 static void 318 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd) 319 { 320 321 ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0); 322 323 sgd->sgd_looffset = ssd->bo; 324 sgd->sgd_hioffset = ssd->bo >> 16; 325 326 sgd->sgd_selector = ssd->ls; 327 328 /* 329 * set type, dpl and present bits. 330 */ 331 sgd->sgd_type = ssd->acc1; 332 sgd->sgd_dpl = ssd->acc1 >> 5; 333 sgd->sgd_p = ssd->acc1 >> 7; 334 ASSERT(sgd->sgd_type == SDT_SYSCGT); 335 ASSERT(sgd->sgd_dpl == SEL_UPL); 336 sgd->sgd_stkcpy = 0; 337 } 338 339 #endif /* __i386 */ 340 341 /* 342 * Load LDT register with the current process's LDT. 343 */ 344 static void 345 ldt_load(void) 346 { 347 #if defined(__xpv) 348 xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc), 349 curproc->p_ldtlimit + 1); 350 #else 351 size_t len; 352 system_desc_t desc; 353 354 /* 355 * Before we can use the LDT on this CPU, we must install the LDT in the 356 * user mapping table. 357 */ 358 len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t); 359 bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len); 360 CPU->cpu_m.mcpu_ldt_len = len; 361 set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL); 362 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc; 363 364 wr_ldtr(ULDT_SEL); 365 #endif 366 } 367 368 /* 369 * Store a NULL selector in the LDTR. All subsequent illegal references to 370 * the LDT will result in a #gp. 371 */ 372 void 373 ldt_unload(void) 374 { 375 #if defined(__xpv) 376 xen_set_ldt(NULL, 0); 377 #else 378 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc; 379 wr_ldtr(0); 380 381 bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len); 382 CPU->cpu_m.mcpu_ldt_len = 0; 383 #endif 384 } 385 386 /*ARGSUSED*/ 387 static void 388 ldt_savectx(proc_t *p) 389 { 390 ASSERT(p->p_ldt != NULL); 391 ASSERT(p == curproc); 392 393 #if defined(__amd64) 394 /* 395 * The 64-bit kernel must be sure to clear any stale ldt 396 * selectors when context switching away from a process that 397 * has a private ldt. Consider the following example: 398 * 399 * Wine creats a ldt descriptor and points a segment register 400 * to it. 401 * 402 * We then context switch away from wine lwp to kernel 403 * thread and hit breakpoint in kernel with kmdb 404 * 405 * When we continue and resume from kmdb we will #gp 406 * fault since kmdb will have saved the stale ldt selector 407 * from wine and will try to restore it but we are no longer in 408 * the context of the wine process and do not have our 409 * ldtr register pointing to the private ldt. 410 */ 411 reset_sregs(); 412 #endif 413 414 ldt_unload(); 415 cpu_fast_syscall_enable(NULL); 416 } 417 418 static void 419 ldt_restorectx(proc_t *p) 420 { 421 ASSERT(p->p_ldt != NULL); 422 ASSERT(p == curproc); 423 424 ldt_load(); 425 cpu_fast_syscall_disable(NULL); 426 } 427 428 /* 429 * When a process with a private LDT execs, fast syscalls must be enabled for 430 * the new process image. 431 */ 432 /* ARGSUSED */ 433 static void 434 ldt_freectx(proc_t *p, int isexec) 435 { 436 ASSERT(p->p_ldt); 437 438 if (isexec) { 439 kpreempt_disable(); 440 cpu_fast_syscall_enable(NULL); 441 kpreempt_enable(); 442 } 443 444 /* 445 * ldt_free() will free the memory used by the private LDT, reset the 446 * process's descriptor, and re-program the LDTR. 447 */ 448 ldt_free(p); 449 } 450 451 /* 452 * Install ctx op that ensures syscall/sysenter are disabled. 453 * See comments below. 454 * 455 * When a thread with a private LDT forks, the new process 456 * must have the LDT context ops installed. 457 */ 458 /* ARGSUSED */ 459 static void 460 ldt_installctx(proc_t *p, proc_t *cp) 461 { 462 proc_t *targ = p; 463 kthread_t *t; 464 465 /* 466 * If this is a fork, operate on the child process. 467 */ 468 if (cp != NULL) { 469 targ = cp; 470 ldt_dup(p, cp); 471 } 472 473 /* 474 * The process context ops expect the target process as their argument. 475 */ 476 ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx, 477 ldt_installctx, ldt_savectx, ldt_freectx) == 0); 478 479 installpctx(targ, targ, ldt_savectx, ldt_restorectx, 480 ldt_installctx, ldt_savectx, ldt_freectx); 481 482 /* 483 * We've just disabled fast system call and return instructions; take 484 * the slow path out to make sure we don't try to use one to return 485 * back to user. We must set t_post_sys for every thread in the 486 * process to make sure none of them escape out via fast return. 487 */ 488 489 mutex_enter(&targ->p_lock); 490 t = targ->p_tlist; 491 do { 492 t->t_post_sys = 1; 493 } while ((t = t->t_forw) != targ->p_tlist); 494 mutex_exit(&targ->p_lock); 495 } 496 497 int 498 setdscr(struct ssd *ssd) 499 { 500 ushort_t seli; /* selector index */ 501 user_desc_t *ldp; /* descriptor pointer */ 502 user_desc_t ndesc; /* new descriptor */ 503 proc_t *pp = ttoproc(curthread); 504 int rc = 0; 505 506 /* 507 * LDT segments: executable and data at DPL 3 only. 508 */ 509 if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel)) 510 return (EINVAL); 511 512 /* 513 * check the selector index. 514 */ 515 seli = SELTOIDX(ssd->sel); 516 if (seli >= MAXNLDT || seli < LDT_UDBASE) 517 return (EINVAL); 518 519 ndesc = null_udesc; 520 mutex_enter(&pp->p_ldtlock); 521 522 /* 523 * If this is the first time for this process then setup a 524 * private LDT for it. 525 */ 526 if (pp->p_ldt == NULL) { 527 ldt_alloc(pp, seli); 528 529 /* 530 * Now that this process has a private LDT, the use of 531 * the syscall/sysret and sysenter/sysexit instructions 532 * is forbidden for this processes because they destroy 533 * the contents of %cs and %ss segment registers. 534 * 535 * Explicity disable them here and add a context handler 536 * to the process. Note that disabling 537 * them here means we can't use sysret or sysexit on 538 * the way out of this system call - so we force this 539 * thread to take the slow path (which doesn't make use 540 * of sysenter or sysexit) back out. 541 */ 542 kpreempt_disable(); 543 ldt_installctx(pp, NULL); 544 cpu_fast_syscall_disable(NULL); 545 ASSERT(curthread->t_post_sys != 0); 546 kpreempt_enable(); 547 548 } else if (seli > pp->p_ldtlimit) { 549 550 /* 551 * Increase size of ldt to include seli. 552 */ 553 ldt_grow(pp, seli); 554 } 555 556 ASSERT(seli <= pp->p_ldtlimit); 557 ldp = &pp->p_ldt[seli]; 558 559 /* 560 * On the 64-bit kernel, this is where things get more subtle. 561 * Recall that in the 64-bit kernel, when we enter the kernel we 562 * deliberately -don't- reload the segment selectors we came in on 563 * for %ds, %es, %fs or %gs. Messing with selectors is expensive, 564 * and the underlying descriptors are essentially ignored by the 565 * hardware in long mode - except for the base that we override with 566 * the gsbase MSRs. 567 * 568 * However, there's one unfortunate issue with this rosy picture -- 569 * a descriptor that's not marked as 'present' will still generate 570 * an #np when loading a segment register. 571 * 572 * Consider this case. An lwp creates a harmless LDT entry, points 573 * one of it's segment registers at it, then tells the kernel (here) 574 * to delete it. In the 32-bit kernel, the #np will happen on the 575 * way back to userland where we reload the segment registers, and be 576 * handled in kern_gpfault(). In the 64-bit kernel, the same thing 577 * will happen in the normal case too. However, if we're trying to 578 * use a debugger that wants to save and restore the segment registers, 579 * and the debugger things that we have valid segment registers, we 580 * have the problem that the debugger will try and restore the 581 * segment register that points at the now 'not present' descriptor 582 * and will take a #np right there. 583 * 584 * We should obviously fix the debugger to be paranoid about 585 * -not- restoring segment registers that point to bad descriptors; 586 * however we can prevent the problem here if we check to see if any 587 * of the segment registers are still pointing at the thing we're 588 * destroying; if they are, return an error instead. (That also seems 589 * a lot better failure mode than SIGKILL and a core file 590 * from kern_gpfault() too.) 591 */ 592 if (SI86SSD_PRES(ssd) == 0) { 593 kthread_t *t; 594 int bad = 0; 595 596 /* 597 * Look carefully at the segment registers of every lwp 598 * in the process (they're all stopped by our caller). 599 * If we're about to invalidate a descriptor that's still 600 * being referenced by *any* of them, return an error, 601 * rather than having them #gp on their way out of the kernel. 602 */ 603 ASSERT(pp->p_lwprcnt == 1); 604 605 mutex_enter(&pp->p_lock); 606 t = pp->p_tlist; 607 do { 608 klwp_t *lwp = ttolwp(t); 609 struct regs *rp = lwp->lwp_regs; 610 #if defined(__amd64) 611 pcb_t *pcb = &lwp->lwp_pcb; 612 #endif 613 614 if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) { 615 bad = 1; 616 break; 617 } 618 619 #if defined(__amd64) 620 if (pcb->pcb_rupdate == 1) { 621 if (ssd->sel == pcb->pcb_ds || 622 ssd->sel == pcb->pcb_es || 623 ssd->sel == pcb->pcb_fs || 624 ssd->sel == pcb->pcb_gs) { 625 bad = 1; 626 break; 627 } 628 } else 629 #endif 630 { 631 if (ssd->sel == rp->r_ds || 632 ssd->sel == rp->r_es || 633 ssd->sel == rp->r_fs || 634 ssd->sel == rp->r_gs) { 635 bad = 1; 636 break; 637 } 638 } 639 640 } while ((t = t->t_forw) != pp->p_tlist); 641 mutex_exit(&pp->p_lock); 642 643 if (bad) { 644 mutex_exit(&pp->p_ldtlock); 645 return (EBUSY); 646 } 647 } 648 649 /* 650 * If acc1 is zero, clear the descriptor (including the 'present' bit) 651 */ 652 if (ssd->acc1 == 0) { 653 rc = ldt_update_segd(ldp, &null_udesc); 654 mutex_exit(&pp->p_ldtlock); 655 return (rc); 656 } 657 658 /* 659 * Check segment type, allow segment not present and 660 * only user DPL (3). 661 */ 662 if (SI86SSD_DPL(ssd) != SEL_UPL) { 663 mutex_exit(&pp->p_ldtlock); 664 return (EINVAL); 665 } 666 667 #if defined(__amd64) 668 /* 669 * Do not allow 32-bit applications to create 64-bit mode code 670 * segments. 671 */ 672 if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 && 673 SI86SSD_ISLONG(ssd)) { 674 mutex_exit(&pp->p_ldtlock); 675 return (EINVAL); 676 } 677 #endif /* __amd64 */ 678 679 /* 680 * Set up a code or data user segment descriptor. 681 */ 682 if (SI86SSD_ISUSEG(ssd)) { 683 ssd_to_usd(ssd, &ndesc); 684 rc = ldt_update_segd(ldp, &ndesc); 685 mutex_exit(&pp->p_ldtlock); 686 return (rc); 687 } 688 689 #if defined(__i386) 690 /* 691 * Allow a call gate only if the destination is in the LDT 692 * and the system is running in 32-bit legacy mode. 693 * 694 * In long mode 32-bit call gates are redefined as 64-bit call 695 * gates and the hw enforces that the target code selector 696 * of the call gate must be 64-bit selector. A #gp fault is 697 * generated if otherwise. Since we do not allow 32-bit processes 698 * to switch themselves to 64-bits we never allow call gates 699 * on 64-bit system system. 700 */ 701 if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) { 702 703 704 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc); 705 rc = ldt_update_segd(ldp, &ndesc); 706 mutex_exit(&pp->p_ldtlock); 707 return (rc); 708 } 709 #endif /* __i386 */ 710 711 mutex_exit(&pp->p_ldtlock); 712 return (EINVAL); 713 } 714 715 /* 716 * Allocate new LDT for process just large enough to contain seli. 717 * Note we allocate and grow LDT in PAGESIZE chunks. We do this 718 * to simplify the implementation and because on the hypervisor it's 719 * required, since the LDT must live on pages that have PROT_WRITE 720 * removed and which are given to the hypervisor. 721 */ 722 static void 723 ldt_alloc(proc_t *pp, uint_t seli) 724 { 725 user_desc_t *ldt; 726 size_t ldtsz; 727 uint_t nsels; 728 729 ASSERT(MUTEX_HELD(&pp->p_ldtlock)); 730 ASSERT(pp->p_ldt == NULL); 731 ASSERT(pp->p_ldtlimit == 0); 732 733 /* 734 * Allocate new LDT just large enough to contain seli. The LDT must 735 * always be allocated in units of pages for KPTI. 736 */ 737 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); 738 nsels = ldtsz / sizeof (user_desc_t); 739 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); 740 741 ldt = kmem_zalloc(ldtsz, KM_SLEEP); 742 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE)); 743 744 #if defined(__xpv) 745 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ)) 746 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed"); 747 #endif 748 749 pp->p_ldt = ldt; 750 pp->p_ldtlimit = nsels - 1; 751 set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL); 752 753 if (pp == curproc) { 754 kpreempt_disable(); 755 ldt_load(); 756 kpreempt_enable(); 757 } 758 } 759 760 static void 761 ldt_free(proc_t *pp) 762 { 763 user_desc_t *ldt; 764 size_t ldtsz; 765 766 ASSERT(pp->p_ldt != NULL); 767 768 mutex_enter(&pp->p_ldtlock); 769 ldt = pp->p_ldt; 770 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 771 772 ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE)); 773 774 pp->p_ldt = NULL; 775 pp->p_ldtlimit = 0; 776 pp->p_ldt_desc = null_sdesc; 777 mutex_exit(&pp->p_ldtlock); 778 779 if (pp == curproc) { 780 kpreempt_disable(); 781 ldt_unload(); 782 kpreempt_enable(); 783 } 784 785 #if defined(__xpv) 786 /* 787 * We are not allowed to make the ldt writable until after 788 * we tell the hypervisor to unload it. 789 */ 790 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE)) 791 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 792 #endif 793 794 kmem_free(ldt, ldtsz); 795 } 796 797 /* 798 * On fork copy new ldt for child. 799 */ 800 static void 801 ldt_dup(proc_t *pp, proc_t *cp) 802 { 803 size_t ldtsz; 804 805 ASSERT(pp->p_ldt != NULL); 806 ASSERT(cp != curproc); 807 808 /* 809 * I assume the parent's ldt can't increase since we're in a fork. 810 */ 811 mutex_enter(&pp->p_ldtlock); 812 mutex_enter(&cp->p_ldtlock); 813 814 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 815 816 ldt_alloc(cp, pp->p_ldtlimit); 817 818 #if defined(__xpv) 819 /* 820 * Make child's ldt writable so it can be copied into from 821 * parent's ldt. This works since ldt_alloc above did not load 822 * the ldt since its for the child process. If we tried to make 823 * an LDT writable that is loaded in hw the setprot operation 824 * would fail. 825 */ 826 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE)) 827 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 828 #endif 829 830 bcopy(pp->p_ldt, cp->p_ldt, ldtsz); 831 832 #if defined(__xpv) 833 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ)) 834 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed"); 835 #endif 836 mutex_exit(&cp->p_ldtlock); 837 mutex_exit(&pp->p_ldtlock); 838 839 } 840 841 static void 842 ldt_grow(proc_t *pp, uint_t seli) 843 { 844 user_desc_t *oldt, *nldt; 845 uint_t nsels; 846 size_t oldtsz, nldtsz; 847 848 ASSERT(MUTEX_HELD(&pp->p_ldtlock)); 849 ASSERT(pp->p_ldt != NULL); 850 ASSERT(pp->p_ldtlimit != 0); 851 852 /* 853 * Allocate larger LDT just large enough to contain seli. The LDT must 854 * always be allocated in units of pages for KPTI. 855 */ 856 nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); 857 nsels = nldtsz / sizeof (user_desc_t); 858 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); 859 ASSERT(nsels > pp->p_ldtlimit); 860 861 oldt = pp->p_ldt; 862 oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); 863 864 nldt = kmem_zalloc(nldtsz, KM_SLEEP); 865 ASSERT(IS_P2ALIGNED(nldt, PAGESIZE)); 866 867 bcopy(oldt, nldt, oldtsz); 868 869 /* 870 * unload old ldt. 871 */ 872 kpreempt_disable(); 873 ldt_unload(); 874 kpreempt_enable(); 875 876 #if defined(__xpv) 877 878 /* 879 * Make old ldt writable and new ldt read only. 880 */ 881 if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE)) 882 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); 883 884 if (xen_ldt_setprot(nldt, nldtsz, PROT_READ)) 885 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed"); 886 #endif 887 888 pp->p_ldt = nldt; 889 pp->p_ldtlimit = nsels - 1; 890 891 /* 892 * write new ldt segment descriptor. 893 */ 894 set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL); 895 896 /* 897 * load the new ldt. 898 */ 899 kpreempt_disable(); 900 ldt_load(); 901 kpreempt_enable(); 902 903 kmem_free(oldt, oldtsz); 904 } --- EOF ---