9600 LDT still not happy under KPTI


 267 #if defined(__amd64)
 268         ssd->acc2 |= usd->usd_long << 1;
 269 #else
 270         ssd->acc2 |= usd->usd_reserved << 1;
 271 #endif
 272 
 273         ssd->acc2 |= usd->usd_def32 << (1 + 1);
 274         ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
 275 }
 276 
 277 static void
 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
 279 {
 280 
 281         ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
 282 
 283         USEGD_SETBASE(usd, ssd->bo);
 284         USEGD_SETLIMIT(usd, ssd->ls);
 285 
 286         /*
 287          * set type, dpl and present bits.
 288          */
 289         usd->usd_type = ssd->acc1;
 290         usd->usd_dpl = ssd->acc1 >> 5;
 291         usd->usd_p = ssd->acc1 >> (5 + 2);
 292 
 293         ASSERT(usd->usd_type >= SDT_MEMRO);
 294         ASSERT(usd->usd_dpl == SEL_UPL);
 295 
 296         /*
 297          * 64-bit code selectors are never allowed in the LDT.
 298          * Reserved bit is always 0 on 32-bit systems.
 299          */
 300 #if defined(__amd64)
 301         usd->usd_long = 0;
 302 #else
 303         usd->usd_reserved = 0;
 304 #endif
 305 
 306         /*
 307          * set avl, DB and granularity bits.
 308          */
 309         usd->usd_avl = ssd->acc2;


 328         /*
 329          * set type, dpl and present bits.
 330          */
 331         sgd->sgd_type = ssd->acc1;
 332         sgd->sgd_dpl = ssd->acc1 >> 5;
 333         sgd->sgd_p = ssd->acc1 >> 7;
 334         ASSERT(sgd->sgd_type == SDT_SYSCGT);
 335         ASSERT(sgd->sgd_dpl == SEL_UPL);
 336         sgd->sgd_stkcpy = 0;
 337 }
 338 
 339 #endif  /* __i386 */
 340 
 341 /*
 342  * Load LDT register with the current process's LDT.
 343  */
 344 static void
 345 ldt_load(void)
 346 {
 347 #if defined(__xpv)
 348         xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
 349             curproc->p_ldtlimit + 1);
 350 #else
 351         size_t len;
 352         system_desc_t desc;
 353 
 354         /*
 355          * Before we can use the LDT on this CPU, we must install the LDT in the
 356          * user mapping table.
 357          */
 358         len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
 359         bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
 360         CPU->cpu_m.mcpu_ldt_len = len;
 361         set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
 362         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
 363 
 364         wr_ldtr(ULDT_SEL);
 365 #endif
 366 }
 367 
 368 /*
 369  * Store a NULL selector in the LDTR. All subsequent illegal references to


 395          * The 64-bit kernel must be sure to clear any stale ldt
 396          * selectors when context switching away from a process that
 397          * has a private ldt. Consider the following example:
 398          *
 399          *      Wine creates an ldt descriptor and points a segment register
 400          *      to it.
 401          *
 402          *      We then context switch away from the wine lwp to a kernel
 403          *      thread and hit a breakpoint in the kernel with kmdb.
 404          *
 405          *      When we continue and resume from kmdb we will #gp
 406          *      fault since kmdb will have saved the stale ldt selector
 407          *      from wine and will try to restore it but we are no longer in
 408          *      the context of the wine process and do not have our
 409          *      ldtr register pointing to the private ldt.
 410          */
 411         reset_sregs();
 412 #endif
 413 
 414         ldt_unload();
 415         cpu_fast_syscall_enable(NULL);
 416 }
 417 
 418 static void
 419 ldt_restorectx(proc_t *p)
 420 {
 421         ASSERT(p->p_ldt != NULL);
 422         ASSERT(p == curproc);
 423 
 424         ldt_load();
 425         cpu_fast_syscall_disable(NULL);
 426 }
 427 
 428 /*
 429  * When a process with a private LDT execs, fast syscalls must be enabled for
 430  * the new process image.
 431  */
 432 /* ARGSUSED */
 433 static void
 434 ldt_freectx(proc_t *p, int isexec)
 435 {
 436         ASSERT(p->p_ldt);
 437 
 438         if (isexec) {
 439                 kpreempt_disable();
 440                 cpu_fast_syscall_enable(NULL);
 441                 kpreempt_enable();
 442         }
 443 
 444         /*
 445          * ldt_free() will free the memory used by the private LDT, reset the
 446          * process's descriptor, and re-program the LDTR.
 447          */
 448         ldt_free(p);
 449 }
 450 
 451 /*
 452  * Install ctx op that ensures syscall/sysenter are disabled.
 453  * See comments below.
 454  *
 455  * When a thread with a private LDT forks, the new process
 456  * must have the LDT context ops installed.
 457  */
 458 /* ARGSUSED */
 459 static void
 460 ldt_installctx(proc_t *p, proc_t *cp)
 461 {
 462         proc_t          *targ = p;
 463         kthread_t       *t;
 464 
 465         /*
 466          * If this is a fork, operate on the child process.
 467          */
 468         if (cp != NULL) {


 483          * We've just disabled fast system call and return instructions; take
 484          * the slow path out to make sure we don't try to use one to return
 485          * back to user. We must set t_post_sys for every thread in the
 486          * process to make sure none of them escape out via fast return.
 487          */
 488 
 489         mutex_enter(&targ->p_lock);
 490         t = targ->p_tlist;
 491         do {
 492                 t->t_post_sys = 1;
 493         } while ((t = t->t_forw) != targ->p_tlist);
 494         mutex_exit(&targ->p_lock);
 495 }
 496 
 497 int
 498 setdscr(struct ssd *ssd)
 499 {
 500         ushort_t seli;          /* selector index */
 501         user_desc_t *ldp;       /* descriptor pointer */
 502         user_desc_t ndesc;      /* new descriptor */
 503         proc_t  *pp = ttoproc(curthread);
 504         int     rc = 0;
 505 
 506         /*
 507          * LDT segments: executable and data at DPL 3 only.
 508          */
 509         if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
 510                 return (EINVAL);
 511 
 512         /*
 513          * check the selector index.
 514          */
 515         seli = SELTOIDX(ssd->sel);
 516         if (seli >= MAXNLDT || seli < LDT_UDBASE)
 517                 return (EINVAL);
 518 
 519         ndesc = null_udesc;
 520         mutex_enter(&pp->p_ldtlock);
 521 
 522         /*
 523          * If this is the first time for this process then set up a
 524          * private LDT for it.
 525          */
 526         if (pp->p_ldt == NULL) {
 527                 ldt_alloc(pp, seli);
 528 
 529                 /*
 530                  * Now that this process has a private LDT, the use of
 531                  * the syscall/sysret and sysenter/sysexit instructions
 532          * is forbidden for this process because they destroy
 533                  * the contents of %cs and %ss segment registers.
 534                  *
 535          * Explicitly disable them here and add a context handler
 536                  * to the process. Note that disabling
 537                  * them here means we can't use sysret or sysexit on
 538                  * the way out of this system call - so we force this
 539                  * thread to take the slow path (which doesn't make use
 540                  * of sysenter or sysexit) back out.
 541                  */
 542                 kpreempt_disable();
 543                 ldt_installctx(pp, NULL);
 544                 cpu_fast_syscall_disable(NULL);
 545                 ASSERT(curthread->t_post_sys != 0);
 546                 kpreempt_enable();
 547 
 548         } else if (seli > pp->p_ldtlimit) {
 549 
 550                 /*
 551                  * Increase size of ldt to include seli.
 552                  */
 553                 ldt_grow(pp, seli);
 554         }
 555 
 556         ASSERT(seli <= pp->p_ldtlimit);
 557         ldp = &pp->p_ldt[seli];
 558 
 559         /*
 560          * On the 64-bit kernel, this is where things get more subtle.
 561          * Recall that in the 64-bit kernel, when we enter the kernel we
 562          * deliberately -don't- reload the segment selectors we came in on
 563          * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
 564          * and the underlying descriptors are essentially ignored by the
 565          * hardware in long mode - except for the base that we override with
 566          * the gsbase MSRs.
 567          *
 568          * However, there's one unfortunate issue with this rosy picture --


 630                         {
 631                                 if (ssd->sel == rp->r_ds ||
 632                                     ssd->sel == rp->r_es ||
 633                                     ssd->sel == rp->r_fs ||
 634                                     ssd->sel == rp->r_gs) {
 635                                         bad = 1;
 636                                         break;
 637                                 }
 638                         }
 639 
 640                 } while ((t = t->t_forw) != pp->p_tlist);
 641                 mutex_exit(&pp->p_lock);
 642 
 643                 if (bad) {
 644                         mutex_exit(&pp->p_ldtlock);
 645                         return (EBUSY);
 646                 }
 647         }
 648 
 649         /*
 650          * If acc1 is zero, clear the descriptor (including the 'present' bit)
 651          */
 652         if (ssd->acc1 == 0) {
 653                 rc  = ldt_update_segd(ldp, &null_udesc);
 654                 mutex_exit(&pp->p_ldtlock);
 655                 return (rc);
 656         }
 657 
 658         /*
 659          * Check segment type, allow segment not present and
 660          * only user DPL (3).
 661          */
 662         if (SI86SSD_DPL(ssd) != SEL_UPL) {
 663                 mutex_exit(&pp->p_ldtlock);
 664                 return (EINVAL);
 665         }
 666 
 667 #if defined(__amd64)
 668         /*
 669          * Do not allow 32-bit applications to create 64-bit mode code
 670          * segments.
 671          */
 672         if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
 673             SI86SSD_ISLONG(ssd)) {
 674                 mutex_exit(&pp->p_ldtlock);
 675                 return (EINVAL);
 676         }
 677 #endif /* __amd64 */
 678 
 679         /*
 680          * Set up a code or data user segment descriptor.
 681          */
 682         if (SI86SSD_ISUSEG(ssd)) {
 683                 ssd_to_usd(ssd, &ndesc);
 684                 rc = ldt_update_segd(ldp, &ndesc);
 685                 mutex_exit(&pp->p_ldtlock);
 686                 return (rc);
 687         }
 688 
 689 #if defined(__i386)
 690         /*
 691          * Allow a call gate only if the destination is in the LDT
 692          * and the system is running in 32-bit legacy mode.
 693          *
 694          * In long mode 32-bit call gates are redefined as 64-bit call
 695          * gates and the hw enforces that the target code selector
 696          * of the call gate must be a 64-bit selector. A #gp fault is
 697          * generated otherwise. Since we do not allow 32-bit processes
 698          * to switch themselves to 64-bit mode we never allow call gates
 699          * on 64-bit systems.
 700          */
 701         if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
 702 
 703 
 704                 ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
 705                 rc = ldt_update_segd(ldp, &ndesc);
 706                 mutex_exit(&pp->p_ldtlock);
 707                 return (rc);
 708         }
 709 #endif  /* __i386 */
 710 
 711         mutex_exit(&pp->p_ldtlock);
 712         return (EINVAL);
 713 }
 714 
 715 /*
 716  * Allocate new LDT for process just large enough to contain seli.
 717  * Note we allocate and grow LDT in PAGESIZE chunks. We do this
 718  * to simplify the implementation and because on the hypervisor it's
 719  * required, since the LDT must live on pages that have PROT_WRITE
 720  * removed and which are given to the hypervisor.
 721  */
 722 static void
 723 ldt_alloc(proc_t *pp, uint_t seli)
 724 {
 725         user_desc_t     *ldt;
 726         size_t          ldtsz;
 727         uint_t          nsels;
 728 
 729         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 730         ASSERT(pp->p_ldt == NULL);
 731         ASSERT(pp->p_ldtlimit == 0);
 732 
 733         /*
 734          * Allocate new LDT just large enough to contain seli. The LDT must
 735          * always be allocated in units of pages for KPTI.
 736          */
 737         ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 738         nsels = ldtsz / sizeof (user_desc_t);
 739         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 740 
 741         ldt = kmem_zalloc(ldtsz, KM_SLEEP);
 742         ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
 743 
 744 #if defined(__xpv)
 745         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
 746                 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
 747 #endif
 748 
 749         pp->p_ldt = ldt;
 750         pp->p_ldtlimit = nsels - 1;
 751         set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
 752 
 753         if (pp == curproc) {
 754                 kpreempt_disable();
 755                 ldt_load();
 756                 kpreempt_enable();
 757         }
 758 }
 759 
 760 static void
 761 ldt_free(proc_t *pp)
 762 {
 763         user_desc_t     *ldt;
 764         size_t          ldtsz;
 765 
 766         ASSERT(pp->p_ldt != NULL);
 767 
 768         mutex_enter(&pp->p_ldtlock);
 769         ldt = pp->p_ldt;
 770         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 771 
 772         ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
 773 
 774         pp->p_ldt = NULL;
 775         pp->p_ldtlimit = 0;
 776         pp->p_ldt_desc = null_sdesc;
 777         mutex_exit(&pp->p_ldtlock);
 778 
 779         if (pp == curproc) {
 780                 kpreempt_disable();
 781                 ldt_unload();
 782                 kpreempt_enable();
 783         }
 784 
 785 #if defined(__xpv)
 786         /*
 787          * We are not allowed to make the ldt writable until after
 788          * we tell the hypervisor to unload it.
 789          */
 790         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
 791                 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 792 #endif
 793 
 794         kmem_free(ldt, ldtsz);
 795 }
 796 


 821          * parent's ldt. This works since ldt_alloc above did not load
 822          * the ldt since it's for the child process. If we tried to make
 823          * an LDT writable that is loaded in hw the setprot operation
 824          * would fail.
 825          */
 826         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
 827                 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 828 #endif
 829 
 830         bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
 831 
 832 #if defined(__xpv)
 833         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
 834                 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
 835 #endif
 836         mutex_exit(&cp->p_ldtlock);
 837         mutex_exit(&pp->p_ldtlock);
 838 
 839 }
 840 
 841 static void
 842 ldt_grow(proc_t *pp, uint_t seli)
 843 {
 844         user_desc_t     *oldt, *nldt;
 845         uint_t          nsels;
 846         size_t          oldtsz, nldtsz;
 847 
 848         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 849         ASSERT(pp->p_ldt != NULL);
 850         ASSERT(pp->p_ldtlimit != 0);
 851 
 852         /*
 853          * Allocate larger LDT just large enough to contain seli. The LDT must
 854          * always be allocated in units of pages for KPTI.
 855          */
 856         nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 857         nsels = nldtsz / sizeof (user_desc_t);
 858         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 859         ASSERT(nsels > pp->p_ldtlimit);
 860 


 871          */
 872         kpreempt_disable();
 873         ldt_unload();
 874         kpreempt_enable();
 875 
 876 #if defined(__xpv)
 877 
 878         /*
 879          * Make old ldt writable and new ldt read only.
 880          */
 881         if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
 882                 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 883 
 884         if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
 885                 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
 886 #endif
 887 
 888         pp->p_ldt = nldt;
 889         pp->p_ldtlimit = nsels - 1;
 890 
 891         /*
 892          * write new ldt segment descriptor.
 893          */
 894         set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
 895 
 896         /*
 897          * load the new ldt.
 898          */
 899         kpreempt_disable();
 900         ldt_load();
 901         kpreempt_enable();
 902 
 903         kmem_free(oldt, oldtsz);
 904 }


 267 #if defined(__amd64)
 268         ssd->acc2 |= usd->usd_long << 1;
 269 #else
 270         ssd->acc2 |= usd->usd_reserved << 1;
 271 #endif
 272 
 273         ssd->acc2 |= usd->usd_def32 << (1 + 1);
 274         ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
 275 }
 276 
 277 static void
 278 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
 279 {
 280 
 281         ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
 282 
 283         USEGD_SETBASE(usd, ssd->bo);
 284         USEGD_SETLIMIT(usd, ssd->ls);
 285 
 286         /*
 287          * Set type, dpl and present bits.
 288          *
 289          * Force the "accessed" bit to on so that we don't run afoul of
 290          * KPTI.
 291          */
 292         usd->usd_type = ssd->acc1 | SDT_A;
 293         usd->usd_dpl = ssd->acc1 >> 5;
 294         usd->usd_p = ssd->acc1 >> (5 + 2);
 295 
 296         ASSERT(usd->usd_type >= SDT_MEMRO);
 297         ASSERT(usd->usd_dpl == SEL_UPL);
 298 
 299         /*
 300          * 64-bit code selectors are never allowed in the LDT.
 301          * Reserved bit is always 0 on 32-bit systems.
 302          */
 303 #if defined(__amd64)
 304         usd->usd_long = 0;
 305 #else
 306         usd->usd_reserved = 0;
 307 #endif
 308 
 309         /*
 310          * set avl, DB and granularity bits.
 311          */
 312         usd->usd_avl = ssd->acc2;


 331         /*
 332          * set type, dpl and present bits.
 333          */
 334         sgd->sgd_type = ssd->acc1;
 335         sgd->sgd_dpl = ssd->acc1 >> 5;
 336         sgd->sgd_p = ssd->acc1 >> 7;
 337         ASSERT(sgd->sgd_type == SDT_SYSCGT);
 338         ASSERT(sgd->sgd_dpl == SEL_UPL);
 339         sgd->sgd_stkcpy = 0;
 340 }
 341 
 342 #endif  /* __i386 */
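
The functional change in ssd_to_usd() above is the "| SDT_A" in the type
assignment: every descriptor installed in the LDT now carries the accessed
bit already set, so the CPU never has to write that bit back the first time
the selector is loaded. Under KPTI that hardware write-back is the hazard
the comment alludes to; pre-setting the bit means the write never happens.
Below is a stand-alone sketch of the acc1 unpacking. The 0xf2 sample value,
the local SDT_A definition, and the classic x86 access-byte layout (a 5-bit
type including S, a 2-bit DPL, and a P bit) are illustrative assumptions,
not values lifted from sys/segments.h.

#include <stdio.h>

#define SDT_A   0x1     /* assumed: "accessed" is bit 0 of the type field */

int
main(void)
{
        unsigned int acc1 = 0xf2;       /* P=1, DPL=3, type=0x12 (r/w data) */
        unsigned int type = (acc1 | SDT_A) & 0x1f;
        unsigned int dpl  = (acc1 >> 5) & 0x3;
        unsigned int p    = (acc1 >> 7) & 0x1;

        /* prints type=0x13 dpl=3 p=1: r/w data, now pre-marked accessed */
        printf("type=0x%x dpl=%u p=%u\n", type, dpl, p);
        return (0);
}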
 343 
 344 /*
 345  * Load LDT register with the current process's LDT.
 346  */
 347 static void
 348 ldt_load(void)
 349 {
 350 #if defined(__xpv)
 351         xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
 352 #else
 353         size_t len;
 354         system_desc_t desc;
 355 
 356         /*
 357          * Before we can use the LDT on this CPU, we must install the LDT in the
 358          * user mapping table.
 359          */
 360         len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
 361         bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
 362         CPU->cpu_m.mcpu_ldt_len = len;
 363         set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
 364         *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
 365 
 366         wr_ldtr(ULDT_SEL);
 367 #endif
 368 }
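
The non-xpv half of ldt_load() is where KPTI is visible: instead of
pointing the GDT at the process's own p_ldt pages, the kernel copies the
LDT into a CPU-private buffer (mcpu_ldt) that lives in the user mapping
table, and builds the LDT descriptor over that copy. Here is a minimal
user-space model of the shape of this arrangement; every name in it is
illustrative, not a kernel interface.

#include <stdio.h>
#include <string.h>

#define LDT_BUFSZ       4096    /* assumed buffer size for the sketch */

static unsigned char cpu_ldt_copy[LDT_BUFSZ];   /* stands in for mcpu_ldt */
static size_t cpu_ldt_len;                      /* .. for mcpu_ldt_len */

static void
model_ldt_load(const unsigned char *proc_ldt, size_t len)
{
        /* copy the process LDT into the CPU-private, user-mapped page */
        memcpy(cpu_ldt_copy, proc_ldt, len);
        cpu_ldt_len = len;
        /* the real kernel now points the GDT's LDT slot at the copy */
}

int
main(void)
{
        unsigned char proc_ldt[64] = { 0 };

        model_ldt_load(proc_ldt, sizeof (proc_ldt));
        (void) printf("per-CPU LDT copy holds %zu bytes\n", cpu_ldt_len);
        return (0);
}

This indirection is what the ldt_grow() comment further down relies on:
since the GDT never references p_ldt directly, the old LDT pages can be
freed once they have been unloaded and copied.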
 369 
 370 /*
 371  * Store a NULL selector in the LDTR. All subsequent illegal references to


 397          * The 64-bit kernel must be sure to clear any stale ldt
 398          * selectors when context switching away from a process that
 399          * has a private ldt. Consider the following example:
 400          *
 401          *      Wine creates an ldt descriptor and points a segment register
 402          *      to it.
 403          *
 404          *      We then context switch away from the wine lwp to a kernel
 405          *      thread and hit a breakpoint in the kernel with kmdb.
 406          *
 407          *      When we continue and resume from kmdb we will #gp
 408          *      fault since kmdb will have saved the stale ldt selector
 409          *      from wine and will try to restore it but we are no longer in
 410          *      the context of the wine process and do not have our
 411          *      ldtr register pointing to the private ldt.
 412          */
 413         reset_sregs();
 414 #endif
 415 
 416         ldt_unload();
 417         cpu_fast_syscall_enable();
 418 }
 419 
 420 static void
 421 ldt_restorectx(proc_t *p)
 422 {
 423         ASSERT(p->p_ldt != NULL);
 424         ASSERT(p == curproc);
 425 
 426         ldt_load();
 427         cpu_fast_syscall_disable();
 428 }
 429 
 430 /*
 431  * At exec time, we need to clear up our LDT context and re-enable fast syscalls
 432  * for the new process image.
 433  *
 434  * The same is true for the other case, where we have:
 435  *
 436  * proc_exit()
 437  *  ->exitpctx()->ldt_savectx()
 438  *  ->freepctx()->ldt_freectx()
 439  *
 440  * Because pre-emption is not prevented between the two callbacks, we could have
 441  * come off CPU, and brought back LDT context when coming back on CPU via
 442  * ldt_restorectx().
 443  */
 444 /* ARGSUSED */
 445 static void
 446 ldt_freectx(proc_t *p, int isexec)
 447 {
 448         ASSERT(p->p_ldt != NULL);
 449         ASSERT(p == curproc);
 450 
 451         kpreempt_disable();
 452         ldt_free(p);
 453         cpu_fast_syscall_enable();
 454         kpreempt_enable();
 455 }
 456 
 457 /*
 458  * Install ctx op that ensures syscall/sysenter are disabled.
 459  * See comments below.
 460  *
 461  * When a thread with a private LDT forks, the new process
 462  * must have the LDT context ops installed.
 463  */
 464 /* ARGSUSED */
 465 static void
 466 ldt_installctx(proc_t *p, proc_t *cp)
 467 {
 468         proc_t          *targ = p;
 469         kthread_t       *t;
 470 
 471         /*
 472          * If this is a fork, operate on the child process.
 473          */
 474         if (cp != NULL) {


 489          * We've just disabled fast system call and return instructions; take
 490          * the slow path out to make sure we don't try to use one to return
 491          * back to user. We must set t_post_sys for every thread in the
 492          * process to make sure none of them escape out via fast return.
 493          */
 494 
 495         mutex_enter(&targ->p_lock);
 496         t = targ->p_tlist;
 497         do {
 498                 t->t_post_sys = 1;
 499         } while ((t = t->t_forw) != targ->p_tlist);
 500         mutex_exit(&targ->p_lock);
 501 }
 502 
 503 int
 504 setdscr(struct ssd *ssd)
 505 {
 506         ushort_t seli;          /* selector index */
 507         user_desc_t *ldp;       /* descriptor pointer */
 508         user_desc_t ndesc;      /* new descriptor */
 509         proc_t  *pp = curproc;
 510         int     rc = 0;
 511 
 512         /*
 513          * LDT segments: executable and data at DPL 3 only.
 514          */
 515         if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
 516                 return (EINVAL);
 517 
 518         /*
 519          * check the selector index.
 520          */
 521         seli = SELTOIDX(ssd->sel);
 522         if (seli >= MAXNLDT || seli < LDT_UDBASE)
 523                 return (EINVAL);
 524 
 525         ndesc = null_udesc;
 526         mutex_enter(&pp->p_ldtlock);
 527 
 528         /*
 529          * If this is the first time for this process then set up a
 530          * private LDT for it.
 531          */
 532         if (pp->p_ldt == NULL) {
 533                 ldt_alloc(pp, seli);
 534 
 535                 /*
 536                  * Now that this process has a private LDT, the use of
 537                  * the syscall/sysret and sysenter/sysexit instructions
 538          * is forbidden for this process because they destroy
 539                  * the contents of %cs and %ss segment registers.
 540                  *
 541          * Explicitly disable them here and add a context handler
 542                  * to the process. Note that disabling
 543                  * them here means we can't use sysret or sysexit on
 544                  * the way out of this system call - so we force this
 545                  * thread to take the slow path (which doesn't make use
 546                  * of sysenter or sysexit) back out.
 547                  */
 548                 kpreempt_disable();
 549                 ldt_installctx(pp, NULL);
 550                 cpu_fast_syscall_disable();
 551                 ASSERT(curthread->t_post_sys != 0);
 552                 kpreempt_enable();
 553 
 554         } else if (seli > pp->p_ldtlimit) {
 555                 ASSERT(pp->p_pctx != NULL);
 556 
 557                 /*
 558                  * Increase size of ldt to include seli.
 559                  */
 560                 ldt_grow(pp, seli);
 561         }
 562 
 563         ASSERT(seli <= pp->p_ldtlimit);
 564         ldp = &pp->p_ldt[seli];
 565 
 566         /*
 567          * On the 64-bit kernel, this is where things get more subtle.
 568          * Recall that in the 64-bit kernel, when we enter the kernel we
 569          * deliberately -don't- reload the segment selectors we came in on
 570          * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
 571          * and the underlying descriptors are essentially ignored by the
 572          * hardware in long mode - except for the base that we override with
 573          * the gsbase MSRs.
 574          *
 575          * However, there's one unfortunate issue with this rosy picture --


 637                         {
 638                                 if (ssd->sel == rp->r_ds ||
 639                                     ssd->sel == rp->r_es ||
 640                                     ssd->sel == rp->r_fs ||
 641                                     ssd->sel == rp->r_gs) {
 642                                         bad = 1;
 643                                         break;
 644                                 }
 645                         }
 646 
 647                 } while ((t = t->t_forw) != pp->p_tlist);
 648                 mutex_exit(&pp->p_lock);
 649 
 650                 if (bad) {
 651                         mutex_exit(&pp->p_ldtlock);
 652                         return (EBUSY);
 653                 }
 654         }
 655 
 656         /*
 657          * If acc1 is zero, clear the descriptor (including the 'present' bit).
 658          * Make sure we update the CPU-private copy of the LDT.
 659          */
 660         if (ssd->acc1 == 0) {
 661                 rc  = ldt_update_segd(ldp, &null_udesc);
 662                 kpreempt_disable();
 663                 ldt_load();
 664                 kpreempt_enable();
 665                 mutex_exit(&pp->p_ldtlock);
 666                 return (rc);
 667         }
 668 
 669         /*
 670          * Check segment type, allow segment not present and
 671          * only user DPL (3).
 672          */
 673         if (SI86SSD_DPL(ssd) != SEL_UPL) {
 674                 mutex_exit(&pp->p_ldtlock);
 675                 return (EINVAL);
 676         }
 677 
 678         /*
 679          * Do not allow 32-bit applications to create 64-bit mode code
 680          * segments.
 681          */
 682         if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
 683             SI86SSD_ISLONG(ssd)) {
 684                 mutex_exit(&pp->p_ldtlock);
 685                 return (EINVAL);
 686         }
 687 
 688         /*
 689          * Set up a code or data user segment descriptor, making sure to update
 690          * the CPU-private copy of the LDT.
 691          */
 692         if (SI86SSD_ISUSEG(ssd)) {
 693                 ssd_to_usd(ssd, &ndesc);
 694                 rc = ldt_update_segd(ldp, &ndesc);
 695                 kpreempt_disable();
 696                 ldt_load();
 697                 kpreempt_enable();
 698                 mutex_exit(&pp->p_ldtlock);
 699                 return (rc);
 700         }
 701 
 702         mutex_exit(&pp->p_ldtlock);
 703         return (EINVAL);
 704 }
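
For reference, setdscr() is reached from user level through
sysi86(SI86DSCR), which is how Wine-style code installs LDT entries. A
hypothetical caller follows, hand-encoding the fields the validation above
expects: the header and struct ssd are the real interface, but the index,
base, limit and acc values are made up for illustration, and the bit
encodings assume the ssd layout that ssd_to_usd() decodes.

#include <sys/sysi86.h>
#include <stdio.h>

int
main(void)
{
        struct ssd ssd;
        unsigned int idx = 2;   /* chosen index; must be >= LDT_UDBASE */

        ssd.sel = (idx << 3) | 0x4 | 0x3;  /* TI=1 selects the LDT, RPL=3 */
        ssd.bo = 0x10000;                  /* segment base address */
        ssd.ls = 0xfff;                    /* limit, byte granular here */
        ssd.acc1 = 0xf2;                   /* P=1, DPL=3, r/w data segment */
        ssd.acc2 = 0x4;                    /* def32=1; avl/long/gran clear */

        if (sysi86(SI86DSCR, &ssd) == -1) {
                perror("sysi86(SI86DSCR)");
                return (1);
        }
        (void) printf("installed LDT selector 0x%x\n", ssd.sel);
        return (0);
}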
 705 
 706 /*
 707  * Allocate new LDT for process just large enough to contain seli.  Note we
 708  * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 709  * implementation and because on the hypervisor it's required, since the LDT
 710  * must live on pages that have PROT_WRITE removed and which are given to the
 711  * hypervisor.
 712  *
 713  * Note that we don't actually load the LDT into the current CPU here: it's done
 714  * later by our caller.
 715  */
 716 static void
 717 ldt_alloc(proc_t *pp, uint_t seli)
 718 {
 719         user_desc_t     *ldt;
 720         size_t          ldtsz;
 721         uint_t          nsels;
 722 
 723         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 724         ASSERT(pp->p_ldt == NULL);
 725         ASSERT(pp->p_ldtlimit == 0);
 726 
 727         /*
 728          * Allocate new LDT just large enough to contain seli. The LDT must
 729          * always be allocated in units of pages for KPTI.
 730          */
 731         ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 732         nsels = ldtsz / sizeof (user_desc_t);
 733         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 734 
 735         ldt = kmem_zalloc(ldtsz, KM_SLEEP);
 736         ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
 737 
 738 #if defined(__xpv)
 739         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
 740                 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
 741 #endif
 742 
 743         pp->p_ldt = ldt;
 744         pp->p_ldtlimit = nsels - 1;
 745 }
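
The sizing here deserves a worked example. With the x86 values assumed
below (an 8-byte user_desc_t and 4 KB pages), any selector index up to 511
costs exactly one page, and p_ldtlimit lands on the last entry of that
page. P2ROUNDUP is re-defined locally in the same shape as the illumos
macro so the sketch is self-contained.

#include <stdio.h>
#include <stddef.h>

#define PAGESIZE        4096
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        unsigned int seli = 40;         /* highest selector index wanted */
        size_t descsz = 8;              /* assumed sizeof (user_desc_t) */
        size_t ldtsz = P2ROUNDUP((size_t)(seli + 1) * descsz, PAGESIZE);
        size_t nsels = ldtsz / descsz;

        /* 41 descriptors round up to one 4K page: 512 slots, limit 511 */
        printf("ldtsz=%zu nsels=%zu p_ldtlimit=%zu\n",
            ldtsz, nsels, nsels - 1);
        return (0);
}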
 746 
 747 static void
 748 ldt_free(proc_t *pp)
 749 {
 750         user_desc_t     *ldt;
 751         size_t          ldtsz;
 752 
 753         ASSERT(pp->p_ldt != NULL);
 754 
 755         mutex_enter(&pp->p_ldtlock);
 756         ldt = pp->p_ldt;
 757         ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
 758 
 759         ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
 760 
 761         pp->p_ldt = NULL;
 762         pp->p_ldtlimit = 0;
 763         mutex_exit(&pp->p_ldtlock);
 764 
 765         if (pp == curproc) {
 766                 kpreempt_disable();
 767                 ldt_unload();
 768                 kpreempt_enable();
 769         }
 770 
 771 #if defined(__xpv)
 772         /*
 773          * We are not allowed to make the ldt writable until after
 774          * we tell the hypervisor to unload it.
 775          */
 776         if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
 777                 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 778 #endif
 779 
 780         kmem_free(ldt, ldtsz);
 781 }
 782 


 807          * parent's ldt. This works since ldt_alloc above did not load
 808          * the ldt since it's for the child process. If we tried to make
 809          * an LDT writable that is loaded in hw the setprot operation
 810          * would fail.
 811          */
 812         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
 813                 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 814 #endif
 815 
 816         bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
 817 
 818 #if defined(__xpv)
 819         if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
 820                 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
 821 #endif
 822         mutex_exit(&cp->p_ldtlock);
 823         mutex_exit(&pp->p_ldtlock);
 824 
 825 }
 826 
 827 /*
 828  * Note that we don't actually load the LDT into the current CPU here: it's done
 829  * later by our caller - unless we take an error.  This works out because
 830  * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
 831  * (and therefore can't be using the freed old LDT), and by definition if the
 832  * new entry didn't pass validation, then the proc shouldn't be referencing an
 833  * entry in the extended region.
 834  */
 835 static void
 836 ldt_grow(proc_t *pp, uint_t seli)
 837 {
 838         user_desc_t     *oldt, *nldt;
 839         uint_t          nsels;
 840         size_t          oldtsz, nldtsz;
 841 
 842         ASSERT(MUTEX_HELD(&pp->p_ldtlock));
 843         ASSERT(pp->p_ldt != NULL);
 844         ASSERT(pp->p_ldtlimit != 0);
 845 
 846         /*
 847          * Allocate larger LDT just large enough to contain seli. The LDT must
 848          * always be allocated in units of pages for KPTI.
 849          */
 850         nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
 851         nsels = nldtsz / sizeof (user_desc_t);
 852         ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
 853         ASSERT(nsels > pp->p_ldtlimit);
 854 


 865          */
 866         kpreempt_disable();
 867         ldt_unload();
 868         kpreempt_enable();
 869 
 870 #if defined(__xpv)
 871 
 872         /*
 873          * Make old ldt writable and new ldt read only.
 874          */
 875         if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
 876                 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
 877 
 878         if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
 879                 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
 880 #endif
 881 
 882         pp->p_ldt = nldt;
 883         pp->p_ldtlimit = nsels - 1;
 884 
 885         kmem_free(oldt, oldtsz);
 886 }