Printable view — illumos bug #9600: "LDT still not happy under KPTI"


 161         int ih_inum;
 162         gate_desc_t ih_interp_desc;
 163         gate_desc_t ih_default_desc;
 164 };
 165 
 166 /*
 167  * The brand infrastructure interposes on two handlers, and we use one as a
 168  * NULL signpost.
 169  */
 170 static struct interposing_handler brand_tbl[2];
 171 
 172 /*
 173  * software prototypes for default local descriptor table
 174  */
 175 
 176 /*
 177  * Routines for loading segment descriptors in format the hardware
 178  * can understand.
 179  */
 180 
 181 #if defined(__amd64)
 182 
 183 /*
 184  * In long mode we have the new L or long mode attribute bit
 185  * for code segments. Only the conforming bit in type is used along
 186  * with descriptor priority and present bits. Default operand size must
 187  * be zero when in long mode. In 32-bit compatibility mode all fields
 188  * are treated as in legacy mode. For data segments while in long mode
 189  * only the present bit is loaded.
 190  */
 191 void
 192 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
 193     uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
 194 {
 195         ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);


 196 
 197         /*
 198          * 64-bit long mode.
 199          */
 200         if (lmode == SDP_LONG)
 201                 dp->usd_def32 = 0;           /* 32-bit operands only */
 202         else
 203                 /*
 204                  * 32-bit compatibility mode.
 205                  */
 206                 dp->usd_def32 = defopsz;     /* 0 = 16, 1 = 32-bit ops */
 207 








 208         dp->usd_long = lmode;        /* 64-bit mode */
 209         dp->usd_type = type;
 210         dp->usd_dpl = dpl;
 211         dp->usd_p = 1;
 212         dp->usd_gran = gran;         /* 0 = bytes, 1 = pages */
 213 
 214         dp->usd_lobase = (uintptr_t)base;
 215         dp->usd_midbase = (uintptr_t)base >> 16;
 216         dp->usd_hibase = (uintptr_t)base >> (16 + 8);
 217         dp->usd_lolimit = size;
 218         dp->usd_hilimit = (uintptr_t)size >> 16;
 219 }
 220 
 221 #elif defined(__i386)
 222 
 223 /*
 224  * Install user segment descriptor for code and data.
 225  */
 226 void
 227 set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
 228     uint_t dpl, uint_t gran, uint_t defopsz)
 229 {
 230         dp->usd_lolimit = size;
 231         dp->usd_hilimit = (uintptr_t)size >> 16;
 232 
 233         dp->usd_lobase = (uintptr_t)base;
 234         dp->usd_midbase = (uintptr_t)base >> 16;
 235         dp->usd_hibase = (uintptr_t)base >> (16 + 8);
 236 
 237         dp->usd_type = type;
 238         dp->usd_dpl = dpl;
 239         dp->usd_p = 1;
 240         dp->usd_def32 = defopsz;     /* 0 = 16, 1 = 32 bit operands */
 241         dp->usd_gran = gran;         /* 0 = bytes, 1 = pages */
 242 }
 243 
 244 #endif  /* __i386 */
 245 
 246 /*
 247  * Install system segment descriptor for LDT and TSS segments.
 248  */
 249 
 250 #if defined(__amd64)
 251 
 252 void
 253 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
 254     uint_t dpl)
 255 {
 256         dp->ssd_lolimit = size;
 257         dp->ssd_hilimit = (uintptr_t)size >> 16;
 258 
 259         dp->ssd_lobase = (uintptr_t)base;
 260         dp->ssd_midbase = (uintptr_t)base >> 16;
 261         dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
 262         dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
 263 
 264         dp->ssd_type = type;
 265         dp->ssd_zero1 = 0;   /* must be zero */
 266         dp->ssd_zero2 = 0;
 267         dp->ssd_dpl = dpl;
 268         dp->ssd_p = 1;
 269         dp->ssd_gran = 0;    /* force byte units */
 270 }
 271 
 272 void *
 273 get_ssd_base(system_desc_t *dp)
 274 {
 275         uintptr_t       base;
 276 
 277         base = (uintptr_t)dp->ssd_lobase |
 278             (uintptr_t)dp->ssd_midbase << 16 |
 279             (uintptr_t)dp->ssd_hibase << (16 + 8) |
 280             (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
 281         return ((void *)base);
 282 }
 283 
 284 #elif defined(__i386)
 285 
 286 void
 287 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
 288     uint_t dpl)
 289 {
 290         dp->ssd_lolimit = size;
 291         dp->ssd_hilimit = (uintptr_t)size >> 16;
 292 
 293         dp->ssd_lobase = (uintptr_t)base;
 294         dp->ssd_midbase = (uintptr_t)base >> 16;
 295         dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
 296 
 297         dp->ssd_type = type;
 298         dp->ssd_zero = 0;    /* must be zero */
 299         dp->ssd_dpl = dpl;
 300         dp->ssd_p = 1;
 301         dp->ssd_gran = 0;    /* force byte units */
 302 }
 303 
 304 void *
 305 get_ssd_base(system_desc_t *dp)
 306 {
 307         uintptr_t       base;
 308 
 309         base = (uintptr_t)dp->ssd_lobase |
 310             (uintptr_t)dp->ssd_midbase << 16 |
 311             (uintptr_t)dp->ssd_hibase << (16 + 8);
 312         return ((void *)base);
 313 }
 314 
 315 #endif  /* __i386 */
 316 
 317 /*
 318  * Install gate segment descriptor for interrupt, trap, call and task gates.
 319  *
 320  * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
 321  * all interrupts.  We have different ISTs for each class of exceptions that are
 322  * most likely to occur while handling an existing exception; while many of
 323  * these are just going to panic, it's nice not to trample on the existing
 324  * exception state for debugging purposes.
 325  *
 326  * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 327  * stack space. This unifies the trampoline handling between user and kernel
 328  * space (and avoids the need to touch %gs).
 329  *
 330  * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 331  * we do a read from KMDB that cause another #PF.  Without its own IST, this
 332  * would stomp on the kernel's mcpu_kpti_flt frame.
 333  */
 334 uint_t
 335 idt_vector_to_ist(uint_t vector)
 336 {


 374     uint_t type, uint_t dpl, uint_t ist)
 375 {
 376         dp->sgd_looffset = (uintptr_t)func;
 377         dp->sgd_hioffset = (uintptr_t)func >> 16;
 378         dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
 379         dp->sgd_selector =  (uint16_t)sel;
 380         dp->sgd_ist = ist;
 381         dp->sgd_type = type;
 382         dp->sgd_dpl = dpl;
 383         dp->sgd_p = 1;
 384 }
 385 
 386 /*
 387  * Updates a single user descriptor in the the GDT of the current cpu.
 388  * Caller is responsible for preventing cpu migration.
 389  */
 390 
 391 void
 392 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
 393 {
 394 #if defined(__xpv)














 395 

 396         uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
 397 
 398         if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
 399                 panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
 400 
 401 #else   /* __xpv */
 402 
 403         CPU->cpu_gdt[sidx] = *udp;
 404 
 405 #endif  /* __xpv */
 406 }
 407 
 408 /*
 409  * Writes single descriptor pointed to by udp into a processes
 410  * LDT entry pointed to by ldp.
 411  */
 412 int
 413 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
 414 {
 415 #if defined(__xpv)














 416 

 417         uint64_t dpa;
 418 
 419         dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
 420             ((uintptr_t)ldp & PAGEOFFSET);
 421 
 422         /*
 423          * The hypervisor is a little more restrictive about what it
 424          * supports in the LDT.
 425          */
 426         if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
 427                 return (EINVAL);
 428 
 429 #else   /* __xpv */
 430 
 431         *ldp = *udp;
 432 
 433 #endif  /* __xpv */
 434         return (0);
 435 }
 436 
 437 #if defined(__xpv)
 438 
 439 /*
 440  * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
 441  * Returns true if a valid entry was written.
 442  */
 443 int
 444 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
 445 {
 446         trap_info_t *ti = ti_arg;       /* XXPV Aargh - segments.h comment */
 447 
 448         /*
 449          * skip holes in the IDT
 450          */




 161         int ih_inum;
 162         gate_desc_t ih_interp_desc;
 163         gate_desc_t ih_default_desc;
 164 };
 165 
 166 /*
 167  * The brand infrastructure interposes on two handlers, and we use one as a
 168  * NULL signpost.
 169  */
 170 static struct interposing_handler brand_tbl[2];
 171 
 172 /*
 173  * software prototypes for default local descriptor table
 174  */
 175 
 176 /*
 177  * Routines for loading segment descriptors in format the hardware
 178  * can understand.
 179  */
 180 


 181 /*
 182  * In long mode we have the new L or long mode attribute bit
 183  * for code segments. Only the conforming bit in type is used along
 184  * with descriptor priority and present bits. Default operand size must
 185  * be zero when in long mode. In 32-bit compatibility mode all fields
 186  * are treated as in legacy mode. For data segments while in long mode
 187  * only the present bit is loaded.
 188  */
 189 void
 190 set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
 191     uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
 192 {
 193         ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
 194         /* This should never be a "system" segment. */
 195         ASSERT3U(type & SDT_S, !=, 0);
 196 
 197         /*
 198          * 64-bit long mode.
 199          */
 200         if (lmode == SDP_LONG)
 201                 dp->usd_def32 = 0;           /* 32-bit operands only */
 202         else
 203                 /*
 204                  * 32-bit compatibility mode.
 205                  */
 206                 dp->usd_def32 = defopsz;     /* 0 = 16, 1 = 32-bit ops */
 207 
 208         /*
 209          * We should always set the "accessed" bit (SDT_A), otherwise the CPU
 210          * will write to the GDT whenever we change segment registers around.
 211          * With KPTI on, the GDT is read-only in the user page table, which
 212          * causes crashes if we don't set this.
 213          */
 214         ASSERT3U(type & SDT_A, !=, 0);
 215 
 216         dp->usd_long = lmode;        /* 64-bit mode */
 217         dp->usd_type = type;
 218         dp->usd_dpl = dpl;
 219         dp->usd_p = 1;
 220         dp->usd_gran = gran;         /* 0 = bytes, 1 = pages */
 221 
 222         dp->usd_lobase = (uintptr_t)base;
 223         dp->usd_midbase = (uintptr_t)base >> 16;
 224         dp->usd_hibase = (uintptr_t)base >> (16 + 8);
 225         dp->usd_lolimit = size;
 226         dp->usd_hilimit = (uintptr_t)size >> 16;
 227 }
 228 


 229 /*























 230  * Install system segment descriptor for LDT and TSS segments.
 231  */
 232 


 233 void
 234 set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
 235     uint_t dpl)
 236 {
 237         dp->ssd_lolimit = size;
 238         dp->ssd_hilimit = (uintptr_t)size >> 16;
 239 
 240         dp->ssd_lobase = (uintptr_t)base;
 241         dp->ssd_midbase = (uintptr_t)base >> 16;
 242         dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
 243         dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
 244 
 245         dp->ssd_type = type;
 246         dp->ssd_zero1 = 0;   /* must be zero */
 247         dp->ssd_zero2 = 0;
 248         dp->ssd_dpl = dpl;
 249         dp->ssd_p = 1;
 250         dp->ssd_gran = 0;    /* force byte units */
 251 }
 252 
 253 void *
 254 get_ssd_base(system_desc_t *dp)
 255 {
 256         uintptr_t       base;
 257 
 258         base = (uintptr_t)dp->ssd_lobase |
 259             (uintptr_t)dp->ssd_midbase << 16 |
 260             (uintptr_t)dp->ssd_hibase << (16 + 8) |
 261             (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
 262         return ((void *)base);
 263 }
 264 

































 265 /*
 266  * Install gate segment descriptor for interrupt, trap, call and task gates.
 267  *
 268  * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
 269  * all interrupts.  We have different ISTs for each class of exceptions that are
 270  * most likely to occur while handling an existing exception; while many of
 271  * these are just going to panic, it's nice not to trample on the existing
 272  * exception state for debugging purposes.
 273  *
 274  * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 275  * stack space. This unifies the trampoline handling between user and kernel
 276  * space (and avoids the need to touch %gs).
 277  *
 278  * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 279  * we do a read from KMDB that cause another #PF.  Without its own IST, this
 280  * would stomp on the kernel's mcpu_kpti_flt frame.
 281  */
 282 uint_t
 283 idt_vector_to_ist(uint_t vector)
 284 {


 322     uint_t type, uint_t dpl, uint_t ist)
 323 {
 324         dp->sgd_looffset = (uintptr_t)func;
 325         dp->sgd_hioffset = (uintptr_t)func >> 16;
 326         dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
 327         dp->sgd_selector =  (uint16_t)sel;
 328         dp->sgd_ist = ist;
 329         dp->sgd_type = type;
 330         dp->sgd_dpl = dpl;
 331         dp->sgd_p = 1;
 332 }
 333 
 334 /*
 335  * Updates a single user descriptor in the the GDT of the current cpu.
 336  * Caller is responsible for preventing cpu migration.
 337  */
 338 
 339 void
 340 gdt_update_usegd(uint_t sidx, user_desc_t *udp)
 341 {
 342 #if defined(DEBUG)
 343         /* This should never be a "system" segment, but it might be null. */
 344         if (udp->usd_p != 0 || udp->usd_type != 0) {
 345                 ASSERT3U(udp->usd_type & SDT_S, !=, 0);
 346         }
 347         /*
 348          * We should always set the "accessed" bit (SDT_A), otherwise the CPU
 349          * will write to the GDT whenever we change segment registers around.
 350          * With KPTI on, the GDT is read-only in the user page table, which
 351          * causes crashes if we don't set this.
 352          */
 353         if (udp->usd_p != 0 || udp->usd_type != 0) {
 354                 ASSERT3U(udp->usd_type & SDT_A, !=, 0);
 355         }
 356 #endif
 357 
 358 #if defined(__xpv)
 359         uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
 360 
 361         if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
 362                 panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
 363 
 364 #else   /* __xpv */

 365         CPU->cpu_gdt[sidx] = *udp;

 366 #endif  /* __xpv */
 367 }
 368 
 369 /*
 370  * Writes single descriptor pointed to by udp into a processes
 371  * LDT entry pointed to by ldp.
 372  */
 373 int
 374 ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
 375 {
 376 #if defined(DEBUG)
 377         /* This should never be a "system" segment, but it might be null. */
 378         if (udp->usd_p != 0 || udp->usd_type != 0) {
 379                 ASSERT3U(udp->usd_type & SDT_S, !=, 0);
 380         }
 381         /*
 382          * We should always set the "accessed" bit (SDT_A), otherwise the CPU
 383          * will write to the LDT whenever we change segment registers around.
 384          * With KPTI on, the LDT is read-only in the user page table, which
 385          * causes crashes if we don't set this.
 386          */
 387         if (udp->usd_p != 0 || udp->usd_type != 0) {
 388                 ASSERT3U(udp->usd_type & SDT_A, !=, 0);
 389         }
 390 #endif
 391 
 392 #if defined(__xpv)
 393         uint64_t dpa;
 394 
 395         dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
 396             ((uintptr_t)ldp & PAGEOFFSET);
 397 
 398         /*
 399          * The hypervisor is a little more restrictive about what it
 400          * supports in the LDT.
 401          */
 402         if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
 403                 return (EINVAL);
 404 
 405 #else   /* __xpv */

 406         *ldp = *udp;
 407 
 408 #endif  /* __xpv */
 409         return (0);
 410 }
 411 
 412 #if defined(__xpv)
 413 
 414 /*
 415  * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
 416  * Returns true if a valid entry was written.
 417  */
 418 int
 419 xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
 420 {
 421         trap_info_t *ti = ti_arg;       /* XXPV Aargh - segments.h comment */
 422 
 423         /*
 424          * skip holes in the IDT
 425          */