/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
 * non-root guest mode, when certain threads are running on the other sibling.
 * This avoids speculation-based information leaks such as L1TF being available
 * to the untrusted guest. The stance we take is that threads from the same
 * zone as the guest VCPU thread are considered safe to run alongside, but all
 * other threads (except the idle thread), and all interrupts, are unsafe. Note
 * that due to the implementation here, there are significant sections of e.g.
 * the dispatcher code that can run concurrently with a guest, until the thread
 * reaches ht_mark(). This code assumes there are only two HT threads per core.
 *
 * The entry points are as follows:
 *
 * ht_mark_as_vcpu()
 *
 * All threads that enter guest mode (i.e. VCPU threads) need to call this at
 * least once, which sets TS_VCPU in ->t_schedflag.
 *
 * ht_mark()
 *
 * A new ->cpu_thread is now curthread (although interrupt threads have their
 * own separate handling). After preventing any interrupts, we will take our
 * own CPU's spinlock and update our own state in mcpu_ht.
 *
 * If our sibling is poisoned (i.e. in guest mode or the little bit of code
 * around it), and we're not compatible (that is, same zone ID, or the idle
 * thread), then we need to ht_kick() that sibling. ht_kick() itself waits for
 * the sibling to call ht_release(), and it will not re-enter guest mode until
 * allowed.
 *
 * Note that we ignore the fact a process can change its zone ID: poisoning
 * threads never do so, and we can ignore the other cases.
 *
 * ht_acquire()
 *
 * We are a VCPU thread about to start guest execution. Interrupts are
 * disabled. We must have already run ht_mark() to be in this code, so there's
 * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
 * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
 * sibling cpu_ht_t. This is so ht_mark() will only ever need to look at its
 * local mcpu_ht.
 *
 * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly
 * to wait out any sibling interrupt: many of them will complete quicker than
 * this.
 *
 * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
 * mitigation against L1TF: no incompatible thread will now be able to populate
 * the L1 cache until *we* ht_release().
 *
 * ht_release()
 *
 * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for
 * this to happen if needed.
 *
 * ht_begin_intr()
 *
 * In an interrupt prolog. We're either a hilevel interrupt, or a pinning
 * interrupt. In both cases, we mark our interrupt depth, and potentially
 * ht_kick(). This enforces exclusion, but doesn't otherwise modify ->ch_state:
 * we want the dispatcher code to essentially ignore interrupts.
 *
 * ht_end_intr()
 *
 * In an interrupt epilogue *or* thread_unpin(). In the first case, we never
 * slept, and we can simply decrement our counter. In the second case, we're an
 * interrupt thread about to sleep: we'll still just decrement our counter, and
 * henceforth treat the thread as a normal thread when it next gets scheduled,
 * until it finally gets to its epilogue.
 *
 * ht_begin_unsafe() / ht_end_unsafe()
 *
 * Mark the current thread as temporarily unsafe (guests should not be executing
 * while a sibling is marked unsafe). This can be used for a thread that's
 * otherwise considered safe, if it needs to handle potentially sensitive data.
 * Right now, this means certain I/O handling operations that reach down into
 * the networking and ZFS sub-systems.
 *
 * ht_should_run(thread, cpu)
 *
 * This is used by the dispatcher when making scheduling decisions: if the
 * sibling is compatible with the given thread, we return B_TRUE. This is
 * essentially trying to guess if any subsequent ht_acquire() will fail, by
 * peeking at the sibling CPU's state. The peek is racy, but if we get things
 * wrong, the "only" consequence is that ht_acquire() may lose.
 *
 * ht_adjust_cpu_score()
 *
 * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here,
 * we'll add a small penalty to the score. This also makes sure a VCPU thread
 * migration behaves properly.
 */
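
/*
 * As a rough usage sketch only (the real callers live in the VMM, not in this
 * file, and enter_guest() below is a hypothetical stand-in for the actual
 * guest entry path), a VCPU thread is expected to use these entry points like
 * so, with interrupts disabled around the acquire/enter/release sequence:
 *
 *	ht_mark_as_vcpu();		// once, before the first guest entry
 *
 *	for (;;) {
 *		switch (ht_acquire()) {
 *		case 1:			// core acquired, L1 already flushed
 *			enter_guest();
 *			ht_release();
 *			break;
 *		case 0:			// timed out; simply retry
 *			break;
 *		case -1:		// lost to another zone's VCPU thread:
 *			thread_affinity_set(curthread, CPU_BEST);
 *			break;		// ...and retry elsewhere
 *		}
 *	}
 *
 * ht_begin_unsafe()/ht_end_unsafe() additionally bracket any work done on the
 * guest's behalf that handles data from outside its zone.
 */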

#include <sys/archsystm.h>
#include <sys/disp.h>
#include <sys/cmt.h>
#include <sys/systm.h>
#include <sys/cpu.h>
#include <sys/var.h>
#include <sys/xc_levels.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>

#define	CS_SHIFT	(8)
#define	CS_MASK		((1 << CS_SHIFT) - 1)
#define	CS_MARK(s)	((s) & CS_MASK)
#define	CS_ZONE(s)	((s) >> CS_SHIFT)
#define	CS_MK(s, z)	((s) | (z << CS_SHIFT))

typedef enum ch_mark {
	CM_IDLE = 0,	/* running CPU idle thread */
	CM_THREAD,	/* running general non-VCPU thread */
	CM_UNSAFE,	/* running ->t_unsafe thread */
	CM_VCPU,	/* running VCPU thread */
	CM_POISONED	/* running in guest */
} ch_mark_t;

/* Double-check our false-sharing padding. */
CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64);
CTASSERT(CM_IDLE == 0);
CTASSERT(CM_POISONED < (1 << CS_SHIFT));
CTASSERT(CM_POISONED > CM_VCPU);
CTASSERT(CM_VCPU > CM_UNSAFE);
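
/*
 * As a worked example of the encoding above (illustrative only): a thread
 * from zone ID 3 currently running in the guest is represented as
 *
 *	CS_MK(CM_POISONED, 3) == (3 << CS_SHIFT) | CM_POISONED == 0x304
 *
 * which decodes back via
 *
 *	CS_MARK(0x304) == CM_POISONED	(the low CS_SHIFT bits)
 *	CS_ZONE(0x304) == 3		(the remaining high bits)
 *
 * The CTASSERTs above ensure every ch_mark_t value fits below 1 << CS_SHIFT.
 */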

static uint_t empty_pil = XC_CPUPOKE_PIL;

/*
 * If disabled, no HT exclusion is performed, and the system is potentially
 * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not
 * vulnerable" CPUID bit.
 */
int ht_exclusion = 1;

/*
 * How long ht_acquire() will spin trying to acquire the core, in microseconds.
 * This is enough time to wait out a significant proportion of interrupts.
 */
clock_t ht_acquire_wait_time = 64;

static cpu_t *
ht_find_sibling(cpu_t *cp)
{
	for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
		pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
		group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;

		if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
			continue;

		if (GROUP_SIZE(cg) == 1)
			break;

		VERIFY3U(GROUP_SIZE(cg), ==, 2);

		if (GROUP_ACCESS(cg, 0) != cp)
			return (GROUP_ACCESS(cg, 0));

		VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);

		return (GROUP_ACCESS(cg, 1));
	}

	return (NULL);
}

/*
 * Initialize HT links. We have to be careful here not to race with
 * ht_begin/end_intr(), which also complicates trying to do this initialization
 * from a cross-call; hence the slightly odd approach below.
 */
void
ht_init(void)
{
	cpu_t *scp = CPU;
	cpu_t *cp = scp;
	ulong_t flags;

	if (!ht_exclusion)
		return;

	mutex_enter(&cpu_lock);

	do {
		thread_affinity_set(curthread, cp->cpu_id);
		flags = intr_clear();

		cp->cpu_m.mcpu_ht.ch_intr_depth = 0;
		cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
		cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID);
		ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL);
		cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp);

		intr_restore(flags);
		thread_affinity_clear(curthread);
	} while ((cp = cp->cpu_next_onln) != scp);

	mutex_exit(&cpu_lock);
}

/*
 * We're adding an interrupt handler of some kind at the given PIL. If this
 * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our
 * pil_needs_kick() optimization, as there is now potentially an unsafe
 * interrupt handler at that PIL. This typically won't occur, so we're not that
 * careful about what's actually getting added, which CPU it's on, or if it gets
 * removed. This also presumes that softints can't cover our empty_pil.
 */
void
ht_intr_alloc_pil(uint_t pil)
{
	ASSERT(pil <= PIL_MAX);

	if (empty_pil == pil)
		empty_pil = PIL_MAX + 1;
}

/*
 * If our sibling is also a VCPU thread from a different zone, we need one of
 * them to give up, otherwise they will just battle each other for exclusion
 * until they exhaust their quantum.
 *
 * We arbitrate between them by dispatch priority: clearly, a higher-priority
 * thread deserves to win the acquisition. However, under CPU load, it'll be
 * very common to see both threads with ->t_pri == 1. If so, we'll break the
 * tie by cpu_id (which is hopefully arbitrary enough).
 *
 * If we lose, the VMM code will take this as a hint to call
 * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
 * somewhere else.
 *
 * Note that all of this state examination is racy, as we don't own any locks
 * here.
 */
static boolean_t
yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
{
	cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht;
	uint64_t sibstate = sibht->ch_state;

	/*
	 * If we're likely just waiting for an interrupt, don't yield.
	 */
	if (sibht->ch_intr_depth != 0)
		return (B_FALSE);

	/*
	 * We're only interested in VCPUs from a different zone.
	 */
	if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
		return (B_FALSE);

	if (curthread->t_pri < sib->cpu_dispatch_pri)
		return (B_TRUE);

	if (curthread->t_pri == sib->cpu_dispatch_pri &&
	    CPU->cpu_id < sib->cpu_id)
		return (B_TRUE);

	return (B_FALSE);
}
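
/*
 * A worked example of the tie-break above (illustrative only): if the VCPU
 * threads on sibling CPUs 4 and 5 belong to different zones and both sit at
 * ->t_pri == 1, neither wins on priority alone, so cpu_id decides: the thread
 * running on CPU 4 (the lower cpu_id) returns B_TRUE here and yields, leaving
 * the thread on CPU 5 free to win ht_acquire().
 */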

static inline boolean_t
sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid)
{
	uint64_t sibstate = sibht->ch_state;

	if (sibht->ch_intr_depth != 0)
		return (B_FALSE);

	if (CS_MARK(sibstate) == CM_UNSAFE)
		return (B_FALSE);

	if (CS_MARK(sibstate) == CM_IDLE)
		return (B_TRUE);

	return (CS_ZONE(sibstate) == zoneid);
}

int
ht_acquire(void)
{
	clock_t wait = ht_acquire_wait_time;
	cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
	zoneid_t zoneid = getzoneid();
	cpu_ht_t *sibht;
	int ret = 0;

	ASSERT(!interrupts_enabled());

	if (ht->ch_sib == NULL) {
		/* For the "sequential" L1TF case. */
		spec_l1d_flush();
		return (1);
	}

	sibht = &ht->ch_sib->cpu_m.mcpu_ht;

	/* A VCPU thread should never change zone. */
	ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
	ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU);
	ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
	ASSERT3U(curthread->t_preempt, >=, 1);
	ASSERT(curthread->t_schedflag & TS_VCPU);

	while (ret == 0 && wait > 0) {

		if (yield_to_vcpu(ht->ch_sib, zoneid)) {
			ret = -1;
			break;
		}

		if (sibling_compatible(sibht, zoneid)) {
			lock_set(&sibht->ch_lock);

			if (sibling_compatible(sibht, zoneid)) {
				ht->ch_state = CS_MK(CM_POISONED, zoneid);
				sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid);
				membar_enter();
				ret = 1;
			}

			lock_clear(&sibht->ch_lock);
		} else {
			drv_usecwait(10);
			wait -= 10;
		}
	}

	DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state,
	    uint64_t, sibht->ch_intr_depth, clock_t, wait);

	if (ret == 1)
		spec_l1d_flush();

	return (ret);
}

void
ht_release(void)
{
	cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
	zoneid_t zoneid = getzoneid();
	cpu_ht_t *sibht;

	ASSERT(!interrupts_enabled());

	if (ht->ch_sib == NULL)
		return;

	ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
	ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
	ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED);
	ASSERT3U(curthread->t_preempt, >=, 1);

	sibht = &ht->ch_sib->cpu_m.mcpu_ht;

	lock_set(&sibht->ch_lock);

	ht->ch_state = CS_MK(CM_VCPU, zoneid);
	sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid);
	membar_producer();

	lock_clear(&sibht->ch_lock);
}
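
/*
 * To summarize the handshake implemented above (this is not additional
 * protocol, just the state that ht_acquire()/ht_release() leave behind): with
 * sibling CPUs A and B, and A's VCPU thread entering the guest for zone Z,
 *
 *	ht_acquire() on A, holding B's ch_lock:
 *		A's mcpu_ht.ch_state	= CS_MK(CM_POISONED, Z)
 *		B's mcpu_ht.ch_sibstate	= CS_MK(CM_POISONED, Z)
 *
 *	ht_release() on A, holding B's ch_lock:
 *		A's mcpu_ht.ch_state	= CS_MK(CM_VCPU, Z)
 *		B's mcpu_ht.ch_sibstate	= CS_MK(CM_VCPU, Z)
 *
 * B only ever consults its own ch_sibstate (under its own ch_lock) from
 * ht_mark() and ht_begin_intr(), which is why the two fields are written in
 * tandem here.
 */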

static void
ht_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
	uint64_t sibstate;

	ASSERT(LOCK_HELD(&ht->ch_lock));
	ASSERT(!interrupts_enabled());

	poke_cpu(ht->ch_sib->cpu_id);

	membar_consumer();
	sibstate = ht->ch_sibstate;

	if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid)
		return;

	lock_clear(&ht->ch_lock);

	/*
	 * Spin until we can see the sibling has been kicked out or is
	 * otherwise OK.
	 */
	for (;;) {
		membar_consumer();
		sibstate = ht->ch_sibstate;

		if (CS_MARK(sibstate) != CM_POISONED ||
		    CS_ZONE(sibstate) == zoneid)
			break;

		SMT_PAUSE();
	}

	lock_set(&ht->ch_lock);
}

static boolean_t
pil_needs_kick(uint_t pil)
{
	return (pil != empty_pil);
}

void
ht_begin_intr(uint_t pil)
{
	ulong_t flags;
	cpu_ht_t *ht;

	ASSERT(pil <= PIL_MAX);

	flags = intr_clear();
	ht = &CPU->cpu_m.mcpu_ht;

	if (ht->ch_sib == NULL) {
		intr_restore(flags);
		return;
	}

	if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) {
		lock_set(&ht->ch_lock);

		membar_consumer();

		if (CS_MARK(ht->ch_sibstate) == CM_POISONED)
			ht_kick(ht, GLOBAL_ZONEID);

		lock_clear(&ht->ch_lock);
	}

	intr_restore(flags);
}

void
ht_end_intr(void)
{
	ulong_t flags;
	cpu_ht_t *ht;

	flags = intr_clear();
	ht = &CPU->cpu_m.mcpu_ht;

	if (ht->ch_sib == NULL) {
		intr_restore(flags);
		return;
	}

	ASSERT3U(ht->ch_intr_depth, >, 0);
	atomic_dec_64(&ht->ch_intr_depth);

	intr_restore(flags);
}
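
/*
 * A sketch of the expected pairing (illustrative; the actual call sites are in
 * the interrupt and dispatcher code, not in this file): each ht_begin_intr()
 * from an interrupt prolog is balanced by exactly one ht_end_intr(), issued
 * either from the matching epilogue or from thread_unpin() when the interrupt
 * thread blocks:
 *
 *	ht_begin_intr(pil);	// depth 0 -> 1 may ht_kick() the sibling
 *	    ...handler runs, possibly as a pinned interrupt thread...
 *	ht_end_intr();		// epilogue or thread_unpin(): depth back to 0
 *
 * Nested interrupts simply push ch_intr_depth higher; ht_begin_intr() only
 * kicks on the 0 -> 1 transition.
 */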

static inline boolean_t
ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
	membar_consumer();

	if (CS_MARK(ht->ch_sibstate) != CM_POISONED)
		return (B_FALSE);

	if (CS_MARK(ht->ch_state) == CM_UNSAFE)
		return (B_TRUE);

	return (CS_ZONE(ht->ch_sibstate) != zoneid);
}

void
ht_mark(void)
{
	zoneid_t zoneid = getzoneid();
	kthread_t *t = curthread;
	ulong_t flags;
	cpu_ht_t *ht;
	cpu_t *cp;

	flags = intr_clear();

	cp = CPU;
	ht = &cp->cpu_m.mcpu_ht;

	if (ht->ch_sib == NULL) {
		intr_restore(flags);
		return;
	}

	lock_set(&ht->ch_lock);

	/*
	 * If we were a nested interrupt and went through the
	 * resume_from_intr() path, we can now be resuming to a pinning
	 * interrupt thread; in which case, skip marking, until we later resume
	 * to a "real" thread.
	 */
	if (ht->ch_intr_depth > 0) {
		ASSERT3P(t->t_intr, !=, NULL);

		if (ht_need_kick(ht, zoneid))
			ht_kick(ht, zoneid);
		goto out;
	}

	if (t == t->t_cpu->cpu_idle_thread) {
		ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
		ht->ch_state = CS_MK(CM_IDLE, zoneid);
	} else {
		uint64_t state = CM_THREAD;

		if (t->t_unsafe)
			state = CM_UNSAFE;
		else if (t->t_schedflag & TS_VCPU)
			state = CM_VCPU;

		ht->ch_state = CS_MK(state, zoneid);

		if (ht_need_kick(ht, zoneid))
			ht_kick(ht, zoneid);
	}

out:
	membar_producer();
	lock_clear(&ht->ch_lock);
	intr_restore(flags);
}

void
ht_begin_unsafe(void)
{
	curthread->t_unsafe++;
	ht_mark();
}

void
ht_end_unsafe(void)
{
	ASSERT3U(curthread->t_unsafe, >, 0);
	curthread->t_unsafe--;
	ht_mark();
}

void
ht_mark_as_vcpu(void)
{
	thread_lock(curthread);
	curthread->t_schedflag |= TS_VCPU;
	ht_mark();
	thread_unlock(curthread);
}

boolean_t
ht_should_run(kthread_t *t, cpu_t *cp)
{
	uint64_t sibstate;
	cpu_t *sib;

	if (t == t->t_cpu->cpu_idle_thread)
		return (B_TRUE);

	if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL)
		return (B_TRUE);

	sibstate = sib->cpu_m.mcpu_ht.ch_state;

	if ((t->t_schedflag & TS_VCPU)) {
		if (CS_MARK(sibstate) == CM_IDLE)
			return (B_TRUE);
		if (CS_MARK(sibstate) == CM_UNSAFE)
			return (B_FALSE);
		return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
	}

	if (CS_MARK(sibstate) < CM_VCPU)
		return (B_TRUE);

	return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
}

pri_t
ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
{
	if (ht_should_run(t, cp))
		return (score);

	/*
	 * If we're a VCPU thread scoring our current CPU, we are most likely
	 * asking to be rescheduled elsewhere after losing ht_acquire(). In
	 * this case, the current CPU is not a good choice, and we should go
	 * elsewhere.
	 */
	if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
		return ((v.v_maxsyspri + 1) * 2);

	return (score + 1);
}
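
/*
 * Illustrative sketch only (the real consumers are disp_lowpri_cpu() and the
 * dispatcher's other placement paths, not code in this file): when weighing a
 * candidate CPU cp for thread t, the dispatcher is expected to do roughly
 *
 *	pri_t score = base_score(t, cp);	// hypothetical base metric
 *	score = ht_adjust_cpu_score(t, cp, score);
 *
 * and then prefer whichever candidate scores best after adjustment, while
 * ht_should_run() serves as the quick yes/no check for a single placement
 * decision.
 */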