/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
 * non-root guest mode, when certain threads are running on the other sibling.
 * This avoids speculation-based information leaks such as L1TF being available
 * to the untrusted guest.  The stance we take is that threads from the same
 * zone as the guest VCPU thread are considered safe to run alongside, but all
 * other threads (except the idle thread), and all interrupts, are unsafe.  Note
 * that due to the implementation here, there are significant sections of e.g.
 * the dispatcher code that can run concurrently with a guest, until the thread
 * reaches ht_mark().  This code assumes there are only two HT threads per core.
 *
 * The entry points are as follows:
 *
 * ht_mark_as_vcpu()
 *
 * All threads that enter guest mode (i.e. VCPU threads) need to call this at
 * least once, which sets TS_VCPU in ->t_schedflag.
 *
 * ht_mark()
 *
 * A new ->cpu_thread is now curthread (although interrupt threads have their
 * own separate handling).  After preventing any interrupts, we will take our
 * own CPU's spinlock and update our own state in mcpu_ht.
 *
 * If our sibling is poisoned (i.e. in guest mode or the little bit of code
 * around it), and we're not compatible (that is, same zone ID, or the idle
 * thread), then we need to ht_kick() that sibling.  ht_kick() itself waits for
 * the sibling to call ht_release(), and it will not re-enter guest mode until
 * allowed.
 *
 * Note that we ignore the fact that a process can change its zone ID:
 * poisoning threads never do so, and we can ignore the other cases.
 *
 * ht_acquire()
 *
 * We are a VCPU thread about to start guest execution.  Interrupts are
 * disabled.  We must have already run ht_mark() to be in this code, so there's
 * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
 * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
 * sibling cpu_ht_t.  This is so ht_mark() will only ever need to look at its
 * local mcpu_ht.
 *
 * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly
 * to wait out any sibling interrupt: many of them will complete quicker than
 * this.
 *
 * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
 * mitigation against L1TF: no incompatible thread will now be able to populate
 * the L1 cache until *we* ht_release().
 *
 * ht_release()
 *
 * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for
 * this to happen if needed.
 *
 * ht_begin_intr()
 *
 * In an interrupt prolog.  We're either a hilevel interrupt, or a pinning
 * interrupt.  In both cases, we mark our interrupt depth, and potentially
 * ht_kick().  This enforces exclusion, but doesn't otherwise modify ->ch_state:
 * we want the dispatcher code to essentially ignore interrupts.
 *
 * ht_end_intr()
 *
 * In an interrupt epilogue *or* thread_unpin().  In the first case, we never
 * slept, and we can simply decrement our counter.  In the second case, we're an
 * interrupt thread about to sleep: we'll still just decrement our counter, and
 * henceforth treat the thread as a normal thread when it next gets scheduled,
 * until it finally gets to its epilogue.
 *
 * ht_begin_unsafe() / ht_end_unsafe()
 *
 * Mark the current thread as temporarily unsafe (guests should not be executing
 * while a sibling is marked unsafe).  This can be used for a thread that's
 * otherwise considered safe, if it needs to handle potentially sensitive data.
 * Right now, this means certain I/O handling operations that reach down into
 * the networking and ZFS sub-systems.
 *
 * ht_should_run(thread, cpu)
 *
 * This is used by the dispatcher when making scheduling decisions: if the
 * sibling is compatible with the given thread, we return B_TRUE.  This is
 * essentially trying to guess if any subsequent ht_acquire() will fail, by
 * peeking at the sibling CPU's state.  The peek is racy, but if we get things
 * wrong, the "only" consequence is that ht_acquire() may lose.
 *
 * ht_adjust_cpu_score()
 *
 * Used when scoring other CPUs in disp_lowpri_cpu().  If we shouldn't run here,
 * we'll add a small penalty to the score.  This also makes sure a VCPU thread
 * migration behaves properly.
 */
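
/*
 * As a rough sketch only (the actual callers live in the VMM code, not in this
 * file), a VCPU thread is expected to use these interfaces along these lines:
 *
 *	ht_mark_as_vcpu();		(once, when the thread is set up)
 *
 *	(then, per guest entry, with interrupts and preemption disabled:)
 *	if (ht_acquire() == 1) {
 *		(enter VMX non-root guest mode)
 *		ht_release();
 *	} else {
 *		(0: retry later; -1: consider thread_affinity_set(CPU_BEST))
 *	}
 */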

#include <sys/archsystm.h>
#include <sys/disp.h>
#include <sys/cmt.h>
#include <sys/systm.h>
#include <sys/cpu.h>
#include <sys/var.h>
#include <sys/xc_levels.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>

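/*
 * The ch_state and ch_sibstate words pack two values: the low CS_SHIFT bits
 * hold a ch_mark_t describing what the CPU is running, and the remaining
 * upper bits hold the zone ID of the thread it is running.
 */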
#define CS_SHIFT (8)
#define CS_MASK ((1 << CS_SHIFT) - 1)
#define CS_MARK(s) ((s) & CS_MASK)
#define CS_ZONE(s) ((s) >> CS_SHIFT)
#define CS_MK(s, z) ((s) | ((z) << CS_SHIFT))

typedef enum ch_mark {
        CM_IDLE = 0,    /* running CPU idle thread */
        CM_THREAD,      /* running general non-VCPU thread */
        CM_UNSAFE,      /* running ->t_unsafe thread */
        CM_VCPU,        /* running VCPU thread */
        CM_POISONED     /* running in guest */
} ch_mark_t;

/* Double-check our false-sharing padding. */
CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64);
CTASSERT(CM_IDLE == 0);
CTASSERT(CM_POISONED < (1 << CS_SHIFT));
CTASSERT(CM_POISONED > CM_VCPU);
CTASSERT(CM_VCPU > CM_UNSAFE);

static uint_t empty_pil = XC_CPUPOKE_PIL;

/*
 * If disabled, no HT exclusion is performed, and the system is potentially
 * vulnerable to L1TF if hyper-threading is enabled and we don't have the
 * "not vulnerable" CPUID bit.
 */
int ht_exclusion = 1;

/*
 * How long ht_acquire() will spin trying to acquire the core, in microseconds.
 * This is enough time to wait out a significant proportion of interrupts.
 */
clock_t ht_acquire_wait_time = 64;

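/*
 * Find our HT sibling: the other CPU in our PGHW_IPIPE (shared instruction
 * pipeline) processor group, or NULL if we have no sibling.
 */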
static cpu_t *
ht_find_sibling(cpu_t *cp)
{
        for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
                pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
                group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;

                if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
                        continue;

                if (GROUP_SIZE(cg) == 1)
                        break;

                VERIFY3U(GROUP_SIZE(cg), ==, 2);

                if (GROUP_ACCESS(cg, 0) != cp)
                        return (GROUP_ACCESS(cg, 0));

                VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);

                return (GROUP_ACCESS(cg, 1));
        }

        return (NULL);
}

/*
 * Initialize HT links.  We have to be careful here not to race with
 * ht_begin/end_intr(), which also complicates trying to do this initialization
 * from a cross-call; hence the slightly odd approach below.
 */
void
ht_init(void)
{
        cpu_t *scp = CPU;
        cpu_t *cp = scp;
        ulong_t flags;

        if (!ht_exclusion)
                return;

        mutex_enter(&cpu_lock);

        do {
                thread_affinity_set(curthread, cp->cpu_id);
                flags = intr_clear();

                cp->cpu_m.mcpu_ht.ch_intr_depth = 0;
                cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
                cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID);
                ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL);
                cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp);

                intr_restore(flags);
                thread_affinity_clear(curthread);
        } while ((cp = cp->cpu_next_onln) != scp);

        mutex_exit(&cpu_lock);
}

/*
 * We're adding an interrupt handler of some kind at the given PIL.  If this
 * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our
 * pil_needs_kick() optimization, as there is now potentially an unsafe
 * interrupt handler at that PIL.  This typically won't occur, so we're not that
 * careful about what's actually getting added, which CPU it's on, or if it gets
 * removed.  This also presumes that softints can't cover our empty_pil.
 */
void
ht_intr_alloc_pil(uint_t pil)
{
        ASSERT(pil <= PIL_MAX);

        if (empty_pil == pil)
                empty_pil = PIL_MAX + 1;
}

/*
 * If our sibling is also a VCPU thread from a different zone, we need one of
 * them to give up, otherwise they will just battle each other for exclusion
 * until they exhaust their quantum.
 *
 * We arbitrate between them by dispatch priority: clearly, a higher-priority
 * thread deserves to win the acquisition.  However, under CPU load, it'll be
 * very common to see both threads with ->t_pri == 1.  If so, we'll break the
 * tie by cpu_id (which is hopefully arbitrary enough).
 *
 * If we lose, the VMM code will take this as a hint to call
 * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
 * somewhere else.
 *
 * Note that all of this state examination is racy, as we don't own any locks
 * here.
 */
static boolean_t
yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
{
        cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht;
        uint64_t sibstate = sibht->ch_state;

        /*
         * If we're likely just waiting for an interrupt, don't yield.
         */
        if (sibht->ch_intr_depth != 0)
                return (B_FALSE);

        /*
         * We're only interested in VCPUs from a different zone.
         */
        if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
                return (B_FALSE);

        if (curthread->t_pri < sib->cpu_dispatch_pri)
                return (B_TRUE);

        if (curthread->t_pri == sib->cpu_dispatch_pri &&
            CPU->cpu_id < sib->cpu_id)
                return (B_TRUE);

        return (B_FALSE);
}

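/*
 * Return B_TRUE if our sibling's current state would allow us to run a guest
 * alongside it: it's not handling an interrupt, not marked unsafe, and is
 * either idle or running a thread from our own zone.
 */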
static inline boolean_t
sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid)
{
        uint64_t sibstate = sibht->ch_state;

        if (sibht->ch_intr_depth != 0)
                return (B_FALSE);

        if (CS_MARK(sibstate) == CM_UNSAFE)
                return (B_FALSE);

        if (CS_MARK(sibstate) == CM_IDLE)
                return (B_TRUE);

        return (CS_ZONE(sibstate) == zoneid);
}

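/*
 * Try to acquire exclusive use of the core for guest execution.  Returns 1 on
 * success (after flushing the L1 cache), 0 if we gave up waiting on the
 * sibling, and -1 if we should yield to a competing VCPU thread instead.
 */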
int
ht_acquire(void)
{
        clock_t wait = ht_acquire_wait_time;
        cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
        zoneid_t zoneid = getzoneid();
        cpu_ht_t *sibht;
        int ret = 0;

        ASSERT(!interrupts_enabled());

        if (ht->ch_sib == NULL) {
                /* For the "sequential" L1TF case. */
                spec_l1d_flush();
                return (1);
        }

        sibht = &ht->ch_sib->cpu_m.mcpu_ht;

        /* A VCPU thread should never change zone. */
        ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
        ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU);
        ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
        ASSERT3U(curthread->t_preempt, >=, 1);
        ASSERT(curthread->t_schedflag & TS_VCPU);

        while (ret == 0 && wait > 0) {

                if (yield_to_vcpu(ht->ch_sib, zoneid)) {
                        ret = -1;
                        break;
                }

                if (sibling_compatible(sibht, zoneid)) {
                        lock_set(&sibht->ch_lock);

                        if (sibling_compatible(sibht, zoneid)) {
                                ht->ch_state = CS_MK(CM_POISONED, zoneid);
                                sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid);
                                membar_enter();
                                ret = 1;
                        }

                        lock_clear(&sibht->ch_lock);
                } else {
                        drv_usecwait(10);
                        wait -= 10;
                }
        }

        DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state,
            uint64_t, sibht->ch_intr_depth, clock_t, wait);

        if (ret == 1)
                spec_l1d_flush();

        return (ret);
}

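/*
 * Drop back from CM_POISONED to CM_VCPU, in both our own state and our
 * sibling's view of us, so that a waiting ht_kick() can proceed.
 */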
void
ht_release(void)
{
        cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
        zoneid_t zoneid = getzoneid();
        cpu_ht_t *sibht;

        ASSERT(!interrupts_enabled());

        if (ht->ch_sib == NULL)
                return;

        ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
        ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
        ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED);
        ASSERT3U(curthread->t_preempt, >=, 1);

        sibht = &ht->ch_sib->cpu_m.mcpu_ht;

        lock_set(&sibht->ch_lock);

        ht->ch_state = CS_MK(CM_VCPU, zoneid);
        sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid);
        membar_producer();

        lock_clear(&sibht->ch_lock);
}

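/*
 * Poke the sibling out of the guest and, if it's poisoned on behalf of another
 * zone, spin (briefly dropping our lock so its ht_release() can take it) until
 * it is no longer poisoned, or is poisoned for our own zone.
 */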
static void
ht_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
        uint64_t sibstate;

        ASSERT(LOCK_HELD(&ht->ch_lock));
        ASSERT(!interrupts_enabled());

        poke_cpu(ht->ch_sib->cpu_id);

        membar_consumer();
        sibstate = ht->ch_sibstate;

        if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid)
                return;

        lock_clear(&ht->ch_lock);

        /*
         * Spin until we can see the sibling has been kicked out or is otherwise
         * OK.
         */
        for (;;) {
                membar_consumer();
                sibstate = ht->ch_sibstate;

                if (CS_MARK(sibstate) != CM_POISONED ||
                    CS_ZONE(sibstate) == zoneid)
                        break;

                SMT_PAUSE();
        }

        lock_set(&ht->ch_lock);
}

static boolean_t
pil_needs_kick(uint_t pil)
{
        return (pil != empty_pil);
}

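/*
 * Interrupt prolog: bump our interrupt depth and, if this is the outermost
 * interrupt at a PIL that may run unsafe code, kick a poisoned sibling out of
 * the guest.  Interrupts count as global-zone work for exclusion purposes.
 */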
void
ht_begin_intr(uint_t pil)
{
        ulong_t flags;
        cpu_ht_t *ht;

        ASSERT(pil <= PIL_MAX);

        flags = intr_clear();
        ht = &CPU->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) {
                lock_set(&ht->ch_lock);

                membar_consumer();

                if (CS_MARK(ht->ch_sibstate) == CM_POISONED)
                        ht_kick(ht, GLOBAL_ZONEID);

                lock_clear(&ht->ch_lock);
        }

        intr_restore(flags);
}

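/*
 * Interrupt epilogue, or thread_unpin(): in either case, just drop our
 * interrupt depth.
 */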
void
ht_end_intr(void)
{
        ulong_t flags;
        cpu_ht_t *ht;

        flags = intr_clear();
        ht = &CPU->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        ASSERT3U(ht->ch_intr_depth, >, 0);
        atomic_dec_64(&ht->ch_intr_depth);

        intr_restore(flags);
}

static inline boolean_t
ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
        membar_consumer();

        if (CS_MARK(ht->ch_sibstate) != CM_POISONED)
                return (B_FALSE);

        if (CS_MARK(ht->ch_state) == CM_UNSAFE)
                return (B_TRUE);

        return (CS_ZONE(ht->ch_sibstate) != zoneid);
}

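/*
 * Record what this CPU is now running (idle, normal, unsafe, or VCPU thread)
 * in ch_state and, if the sibling is poisoned and incompatible with that,
 * ht_kick() it.  Called once a new ->cpu_thread is in place.
 */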
void
ht_mark(void)
{
        zoneid_t zoneid = getzoneid();
        kthread_t *t = curthread;
        ulong_t flags;
        cpu_ht_t *ht;
        cpu_t *cp;

        flags = intr_clear();

        cp = CPU;
        ht = &cp->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        lock_set(&ht->ch_lock);

        /*
         * If we were a nested interrupt and went through the resume_from_intr()
         * path, we can now be resuming to a pinning interrupt thread; in that
         * case, we skip marking until we later resume to a "real" thread.
         */
        if (ht->ch_intr_depth > 0) {
                ASSERT3P(t->t_intr, !=, NULL);

                if (ht_need_kick(ht, zoneid))
                        ht_kick(ht, zoneid);
                goto out;
        }

        if (t == t->t_cpu->cpu_idle_thread) {
                ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
                ht->ch_state = CS_MK(CM_IDLE, zoneid);
        } else {
                uint64_t state = CM_THREAD;

                if (t->t_unsafe)
                        state = CM_UNSAFE;
                else if (t->t_schedflag & TS_VCPU)
                        state = CM_VCPU;

                ht->ch_state = CS_MK(state, zoneid);

                if (ht_need_kick(ht, zoneid))
                        ht_kick(ht, zoneid);
        }

out:
        membar_producer();
        lock_clear(&ht->ch_lock);
        intr_restore(flags);
}

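/*
 * Mark the current thread as temporarily unsafe to run alongside a guest;
 * ht_end_unsafe() undoes this.  The ->t_unsafe count nests.
 */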
void
ht_begin_unsafe(void)
{
        curthread->t_unsafe++;
        ht_mark();
}

void
ht_end_unsafe(void)
{
        ASSERT3U(curthread->t_unsafe, >, 0);
        curthread->t_unsafe--;
        ht_mark();
}

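/*
 * Flag the current thread as a VCPU thread; this must be done at least once
 * before it can enter guest mode.
 */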
void
ht_mark_as_vcpu(void)
{
        thread_lock(curthread);
        curthread->t_schedflag |= TS_VCPU;
        ht_mark();
        thread_unlock(curthread);
}

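/*
 * Would thread t be compatible with cp's sibling as things stand right now?
 * Used by the dispatcher when making scheduling decisions; the peek at the
 * sibling's state is racy, so this is only a guess.
 */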
boolean_t
ht_should_run(kthread_t *t, cpu_t *cp)
{
        uint64_t sibstate;
        cpu_t *sib;

        if (t == t->t_cpu->cpu_idle_thread)
                return (B_TRUE);

        if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL)
                return (B_TRUE);

        sibstate = sib->cpu_m.mcpu_ht.ch_state;

        if ((t->t_schedflag & TS_VCPU)) {
                if (CS_MARK(sibstate) == CM_IDLE)
                        return (B_TRUE);
                if (CS_MARK(sibstate) == CM_UNSAFE)
                        return (B_FALSE);
                return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
        }

        if (CS_MARK(sibstate) < CM_VCPU)
                return (B_TRUE);

        return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
}

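/*
 * Adjust the dispatcher's score for cp when placing t: add a small penalty if
 * t shouldn't run there, and strongly discourage a VCPU thread from staying on
 * its current CPU once it has lost ht_acquire().
 */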
pri_t
ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
{
        if (ht_should_run(t, cp))
                return (score);

        /*
         * If we're a VCPU thread scoring our current CPU, we are most likely
         * asking to be rescheduled elsewhere after losing ht_acquire().  In
         * this case, the current CPU is probably not a good choice, and we
         * should go elsewhere.
         */
        if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
                return ((v.v_maxsyspri + 1) * 2);

        return (score + 1);
}