/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
/*	All Rights Reserved						*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation			*/
/*	All Rights Reserved						*/

#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct ssd ssd;
	int error = 0;
	int c;
	proc_t *pp = curproc;

	switch (cmd) {

	/*
	 * The SI86V86 subsystem call of the SYSI86 system call
	 * supports only one subcode -- V86SC_IOPL.
	 */
	case SI86V86:
		if (arg1 == V86SC_IOPL) {
			struct regs *rp = lwptoregs(ttolwp(curthread));
			greg_t oldpl = rp->r_ps & PS_IOPL;
			greg_t newpl = arg2 & PS_IOPL;

			/*
			 * Must be privileged to run this system call
			 * if giving more io privilege.
			 */
			if (newpl > oldpl && (error =
			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
				return (set_errno(error));
#if defined(__xpv)
			kpreempt_disable();
			installctx(curthread, NULL, xen_disable_user_iopl,
			    xen_enable_user_iopl, NULL, NULL,
			    xen_disable_user_iopl, NULL);
			xen_enable_user_iopl();
			kpreempt_enable();
#else
			rp->r_ps ^= oldpl ^ newpl;
#endif
		} else
			error = EINVAL;
		break;

	/*
	 * Set a segment descriptor
	 */
	case SI86DSCR:
		/*
		 * There are considerable problems here manipulating
		 * resources shared by many running lwps. Get everyone
		 * into a safe state before changing the LDT.
		 */
		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
			error = EINTR;
			break;
		}

		if (get_udatamodel() == DATAMODEL_LP64) {
			error = EINVAL;
			break;
		}

		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
			error = EFAULT;
			break;
		}

		error = setdscr(&ssd);

		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		break;

	case SI86FPHW:
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1)
			error = EFAULT;
		break;

	case SI86FPSTART:
		/*
		 * arg1 is the address of _fp_hw
		 * arg2 is the desired x87 FCW value
		 * arg3 is the desired SSE MXCSR value
		 * a return value of one means SSE hardware, else none.
		 */
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1) {
			error = EFAULT;
			break;
		}
		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
		return ((fp_kind & __FP_SSE) ? 1 : 0);

	/* real time clock management commands */

	case WTODC:
		if ((error = secpolicy_settime(CRED())) == 0) {
			timestruc_t ts;
			mutex_enter(&tod_lock);
			gethrestime(&ts);
			tod_set(ts);
			mutex_exit(&tod_lock);
		}
		break;

/* Give some timezone playing room */
#define	ONEWEEK	(7 * 24 * 60 * 60)

	case SGMTL:
		/*
		 * Called from 32 bit land, negative values
		 * are not sign extended, so we do that here
		 * by casting it to an int and back.  We also
		 * clamp the value to within reason and detect
		 * when a 64 bit call overflows an int.
		 */
		if ((error = secpolicy_settime(CRED())) == 0) {
			int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
			if (get_udatamodel() == DATAMODEL_NATIVE &&
			    (long)newlag != (long)arg1) {
				error = EOVERFLOW;
			} else
#endif
			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
				sgmtl(newlag);
			else
				error = EOVERFLOW;
		}
		break;

	case GGMTL:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (sulword((void *)arg1, ggmtl()) == -1)
				error = EFAULT;
#ifdef _SYSCALL32_IMPL
		} else {
			time_t gmtl;

			if ((gmtl = ggmtl()) > INT32_MAX) {
				/*
				 * Since gmt_lag can at most be
				 * +/- 12 hours, something is
				 * *seriously* messed up here.
				 */
				error = EOVERFLOW;
			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
				error = EFAULT;
#endif
		}
		break;

	case RTCSYNC:
		if ((error = secpolicy_settime(CRED())) == 0)
			rtcsync();
		break;

	/* END OF real time clock management commands */

	default:
		error = EINVAL;
		break;
	}
	return (error == 0 ? 0 : set_errno(error));
}

void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
	ssd->bo = USEGD_GETBASE(usd);
	ssd->ls = USEGD_GETLIMIT(usd);
	ssd->sel = sel;

	/*
	 * set type, dpl and present bits.
	 */
	ssd->acc1 = usd->usd_type;
	ssd->acc1 |= usd->usd_dpl << 5;
	ssd->acc1 |= usd->usd_p << (5 + 2);

	/*
	 * set avl, DB and granularity bits.
	 */
	ssd->acc2 = usd->usd_avl;

#if defined(__amd64)
	ssd->acc2 |= usd->usd_long << 1;
#else
	ssd->acc2 |= usd->usd_reserved << 1;
#endif

	ssd->acc2 |= usd->usd_def32 << (1 + 1);
	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * Set type, dpl and present bits.
	 *
	 * Force the "accessed" bit to on so that we don't run afoul of
	 * KPTI.
	 */
	usd->usd_type = ssd->acc1 | SDT_A;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
#if defined(__amd64)
	usd->usd_long = 0;
#else
	usd->usd_reserved = 0;
#endif

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}


#if defined(__i386)

static void
ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
{

	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);

	sgd->sgd_looffset = ssd->bo;
	sgd->sgd_hioffset = ssd->bo >> 16;

	sgd->sgd_selector = ssd->ls;

	/*
	 * set type, dpl and present bits.
	 */
	sgd->sgd_type = ssd->acc1;
	sgd->sgd_dpl = ssd->acc1 >> 5;
	sgd->sgd_p = ssd->acc1 >> 7;
	ASSERT(sgd->sgd_type == SDT_SYSCGT);
	ASSERT(sgd->sgd_dpl == SEL_UPL);
	sgd->sgd_stkcpy = 0;
}

#endif	/* __i386 */

/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in the
	 * user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}

/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

#if defined(__amd64)
	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 * Wine creates an ldt descriptor and points a segment register
	 * to it.
	 *
	 * We then context switch away from the wine lwp to a kernel
	 * thread and hit a breakpoint in the kernel with kmdb.
	 *
	 * When we continue and resume from kmdb we will #gp
	 * fault since kmdb will have saved the stale ldt selector
	 * from wine and will try to restore it, but we are no longer in
	 * the context of the wine process and do not have our
	 * ldtr register pointing to the private ldt.
	 */
	reset_sregs();
#endif

	ldt_unload();
	cpu_fast_syscall_enable();
}

static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable();
}

/*
 * At exec time, we need to clear up our LDT context and re-enable fast syscalls
 * for the new process image.
 *
 * The same is true for the other case, where we have:
 *
 * proc_exit()
 *  ->exitpctx()->ldt_savectx()
 *  ->freepctx()->ldt_freectx()
 *
 * Because pre-emption is not prevented between the two callbacks, we could have
 * come off CPU, and brought back LDT context when coming back on CPU via
 * ldt_restorectx().
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	kpreempt_disable();
	ldt_free(p);
	cpu_fast_syscall_enable();
	kpreempt_enable();
}

/*
 * Install ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t *targ = p;
	kthread_t *t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}

int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t *pp = curproc;
	int rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then set up a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this process because they destroy
		 * the contents of the %cs and %ss segment registers.
		 *
		 * Explicitly disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable();
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {
		ASSERT(pp->p_pctx != NULL);

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case.  An lwp creates a harmless LDT entry, points
	 * one of its segment registers at it, then tells the kernel (here)
	 * to delete it.  In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
	 * will happen in the normal case too.  However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger thinks that we have valid segment registers, we
	 * have the problem that the debugger will try and restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a much better failure mode than SIGKILL and a core file
	 * from kern_gpfault().)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
#if defined(__amd64)
			pcb_t *pcb = &lwp->lwp_pcb;
#endif

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

#if defined(__amd64)
			if (pcb->pcb_rupdate == 1) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else
#endif
			{
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 * Make sure we update the CPU-private copy of the LDT.
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check segment type, allow segment not present and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Set up a code or data user segment descriptor, making sure to update
	 * the CPU-private copy of the LDT.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}

/*
 * Allocate new LDT for process just large enough to contain seli. Note we
 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 * implementation and because on the hypervisor it's required, since the LDT
 * must live on pages that have PROT_WRITE removed and which are given to the
 * hypervisor.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's done
 * later by our caller.
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t *ldt;
	size_t ldtsz;
	uint_t nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
}

static void
ldt_free(proc_t *pp)
{
	user_desc_t *ldt;
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}

/*
 * On fork copy new ldt for child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make the child's ldt writable so it can be copied into from the
	 * parent's ldt. This works since ldt_alloc above did not load
	 * the ldt, since it's for the child process. If we tried to make
	 * an LDT writable that is loaded in hardware, the setprot
	 * operation would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);
}

/*
 * Note that we don't actually load the LDT into the current CPU here: it's done
 * later by our caller - unless we take an error. This works out because
 * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
 * (and therefore can't be using the freed old LDT), and by definition if the
 * new entry didn't pass validation, then the proc shouldn't be referencing an
 * entry in the extended region.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t *oldt, *nldt;
	uint_t nsels;
	size_t oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	kmem_free(oldt, oldtsz);
}