/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
 * non-root guest mode, when certain threads are running on the other sibling.
 * This avoids speculation-based information leaks such as L1TF being available
 * to the untrusted guest.  The stance we take is that threads from the same
 * zone as the guest VCPU thread are considered safe to run alongside, but all
 * other threads (except the idle thread), and all interrupts, are unsafe.  Note
 * that due to the implementation here, there are significant sections of e.g.
 * the dispatcher code that can run concurrently with a guest, until the thread
 * reaches ht_mark().  This code assumes there are only two HT threads per core.
 *
 * The entry points are as follows:
 *
 * ht_mark_as_vcpu()
 *
 * All threads that enter guest mode (i.e. VCPU threads) need to call this at
 * least once, which sets TS_VCPU in ->t_schedflag.
 *
 * ht_mark()
 *
 * A new ->cpu_thread is now curthread (although interrupt threads have their
 * own separate handling).  After preventing any interrupts, we will take our
 * own CPU's spinlock and update our own state in mcpu_ht.
 *
 * If our sibling is poisoned (i.e. in guest mode or the little bit of code
 * around it), and we're not compatible (that is, same zone ID, or the idle
 * thread), then we need to ht_kick() that sibling.  ht_kick() itself waits for
 * the sibling to call ht_release(), and it will not re-enter guest mode until
 * allowed.
 *
 * Note that we ignore the fact a process can change its zone ID: poisoning
 * threads never do so, and we can ignore the other cases.
 *
 * ht_acquire()
 *
 * We are a VCPU thread about to start guest execution.  Interrupts are
 * disabled.  We must have already run ht_mark() to be in this code, so there's
 * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
 * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
 * sibling cpu_ht_t.  This is so ht_mark() will only ever need to look at its
 * local mcpu_ht.
 *
 * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly
 * to wait out any sibling interrupt: many of them will complete quicker than
 * this.
 *
 * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
 * mitigation against L1TF: no incompatible thread will now be able to populate
 * the L1 cache until *we* ht_release().
 *
 * ht_release()
 *
 * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for
 * this to happen if needed.
 *
 * ht_begin_intr()
 *
 * In an interrupt prolog.  We're either a hilevel interrupt, or a pinning
 * interrupt.  In both cases, we mark our interrupt depth, and potentially
 * ht_kick().  This enforces exclusion, but doesn't otherwise modify ->ch_state:
 * we want the dispatcher code to essentially ignore interrupts.
 *
 * ht_end_intr()
 *
 * In an interrupt epilogue *or* thread_unpin().  In the first case, we never
 * slept, and we can simply decrement our counter.  In the second case, we're an
 * interrupt thread about to sleep: we'll still just decrement our counter, and
 * henceforth treat the thread as a normal thread when it next gets scheduled,
 * until it finally gets to its epilogue.
 *
 * ht_begin_unsafe() / ht_end_unsafe()
 *
 * Mark the current thread as temporarily unsafe (guests should not be executing
 * while a sibling is marked unsafe).  This can be used for a thread that's
 * otherwise considered safe, if it needs to handle potentially sensitive data.
 * Right now, this means certain I/O handling operations that reach down into
 * the networking and ZFS sub-systems.
 *
 * ht_should_run(thread, cpu)
 *
 * This is used by the dispatcher when making scheduling decisions: if the
 * sibling is compatible with the given thread, we return B_TRUE.  This is
 * essentially trying to guess if any subsequent ht_acquire() will fail, by
 * peeking at the sibling CPU's state.  The peek is racy, but if we get things
 * wrong, the "only" consequence is that ht_acquire() may lose.
 *
 * ht_adjust_cpu_score()
 *
 * Used when scoring other CPUs in disp_lowpri_cpu().  If we shouldn't run here,
 * we'll add a small penalty to the score.  This also makes sure a VCPU thread
 * migration behaves properly.
 */
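
/*
 * An illustrative sketch (not an interface defined in this file) of how a VMM
 * VCPU run loop is expected to use these entry points; the real callers live
 * in the VMM code, and the details here are only a rough guide:
 *
 *     ht_mark_as_vcpu();              (once, when the thread becomes a VCPU)
 *     ...
 *     (with interrupts disabled, immediately before guest entry)
 *     ret = ht_acquire();
 *     if (ret == 1) {
 *             (enter the guest via VMLAUNCH/VMRESUME)
 *             ht_release();
 *     } else if (ret == -1) {
 *             (we lost to the sibling's VCPU: the VMM should consider
 *             thread_affinity_set(CPU_BEST) and try elsewhere)
 *     }
 *     (a return of 0 means we timed out waiting for the sibling)
 */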

#include <sys/archsystm.h>
#include <sys/disp.h>
#include <sys/cmt.h>
#include <sys/systm.h>
#include <sys/cpu.h>
#include <sys/var.h>
#include <sys/xc_levels.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/x86_archext.h>

#define CS_SHIFT        (8)
#define CS_MASK         ((1 << CS_SHIFT) - 1)
#define CS_MARK(s)      ((s) & CS_MASK)
#define CS_ZONE(s)      ((s) >> CS_SHIFT)
#define CS_MK(s, z)     ((s) | ((z) << CS_SHIFT))

typedef enum ch_mark {
        CM_IDLE = 0,    /* running CPU idle thread */
        CM_THREAD,      /* running general non-VCPU thread */
        CM_UNSAFE,      /* running ->t_unsafe thread */
        CM_VCPU,        /* running VCPU thread */
        CM_POISONED     /* running in guest */
} ch_mark_t;
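
/*
 * For example, a CPU running a guest VCPU owned by zone ID 3 has
 * ch_state == CS_MK(CM_POISONED, 3), i.e. (3 << CS_SHIFT) | CM_POISONED,
 * or 0x304; CS_MARK() recovers CM_POISONED and CS_ZONE() recovers 3.
 */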

/* Double-check our false-sharing padding. */
CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64);
CTASSERT(CM_IDLE == 0);
CTASSERT(CM_POISONED < (1 << CS_SHIFT));
CTASSERT(CM_POISONED > CM_VCPU);
CTASSERT(CM_VCPU > CM_UNSAFE);

static uint_t empty_pil = XC_CPUPOKE_PIL;

/*
 * If disabled, no HT exclusion is performed, and the system is potentially
 * vulnerable to L1TF if hyper-threading is enabled and we don't have the "not
 * vulnerable" CPUID bit.
 */
int ht_exclusion = 1;

/*
 * How long ht_acquire() will spin trying to acquire the core, in microseconds.
 * This is enough time to wait out a significant proportion of interrupts.
 */
clock_t ht_acquire_wait_time = 64;

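/*
 * Find the other CPU in our physical core: that is, the other member of our
 * PGHW_IPIPE processor group.  Returns NULL if this CPU has no HT sibling.
 */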
static cpu_t *
ht_find_sibling(cpu_t *cp)
{
        for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
                pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
                group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;

                if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
                        continue;

                if (GROUP_SIZE(cg) == 1)
                        break;

                VERIFY3U(GROUP_SIZE(cg), ==, 2);

                if (GROUP_ACCESS(cg, 0) != cp)
                        return (GROUP_ACCESS(cg, 0));

                VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);

                return (GROUP_ACCESS(cg, 1));
        }

        return (NULL);
}

/*
 * Initialize HT links.  We have to be careful here not to race with
 * ht_begin/end_intr(), which also complicates trying to do this initialization
 * from a cross-call; hence the slightly odd approach below.
 */
void
ht_init(void)
{
        cpu_t *scp = CPU;
        cpu_t *cp = scp;
        ulong_t flags;

        if (!ht_exclusion)
                return;

        mutex_enter(&cpu_lock);

        do {
                thread_affinity_set(curthread, cp->cpu_id);
                flags = intr_clear();

                cp->cpu_m.mcpu_ht.ch_intr_depth = 0;
                cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
                cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID);
                ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL);
                cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp);

                intr_restore(flags);
                thread_affinity_clear(curthread);
        } while ((cp = cp->cpu_next_onln) != scp);

        mutex_exit(&cpu_lock);
}

/*
 * We're adding an interrupt handler of some kind at the given PIL.  If this
 * happens to be the same PIL as XC_CPUPOKE_PIL, then we need to disable our
 * pil_needs_kick() optimization, as there is now potentially an unsafe
 * interrupt handler at that PIL.  This typically won't occur, so we're not that
 * careful about what's actually getting added, which CPU it's on, or if it gets
 * removed.  This also presumes that softints can't cover our empty_pil.
 */
void
ht_intr_alloc_pil(uint_t pil)
{
        ASSERT(pil <= PIL_MAX);

        if (empty_pil == pil)
                empty_pil = PIL_MAX + 1;
}

/*
 * If our sibling is also a VCPU thread from a different zone, we need one of
 * them to give up, otherwise they will just battle each other for exclusion
 * until they exhaust their quantum.
 *
 * We arbitrate between them by dispatch priority: clearly, a higher-priority
 * thread deserves to win the acquisition.  However, under CPU load, it'll be
 * very common to see both threads with ->t_pri == 1.  If so, we'll break the
 * tie by cpu_id (which is hopefully arbitrary enough).
 *
 * If we lose, the VMM code will take this as a hint to call
 * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
 * somewhere else.
 *
 * Note that all of this state examination is racy, as we don't own any locks
 * here.
 */
static boolean_t
yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
{
        cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht;
        uint64_t sibstate = sibht->ch_state;

        /*
         * If we're likely just waiting for an interrupt, don't yield.
         */
        if (sibht->ch_intr_depth != 0)
                return (B_FALSE);

        /*
         * We're only interested in VCPUs from a different zone.
         */
        if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
                return (B_FALSE);

        if (curthread->t_pri < sib->cpu_dispatch_pri)
                return (B_TRUE);

        if (curthread->t_pri == sib->cpu_dispatch_pri &&
            CPU->cpu_id < sib->cpu_id)
                return (B_TRUE);

        return (B_FALSE);
}

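/*
 * Can a VCPU thread from the given zone run alongside whatever our sibling is
 * currently doing?  It can if the sibling isn't handling an interrupt, isn't
 * marked unsafe, and is either idle or running a thread from the same zone.
 */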
static inline boolean_t
sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid)
{
        uint64_t sibstate = sibht->ch_state;

        if (sibht->ch_intr_depth != 0)
                return (B_FALSE);

        if (CS_MARK(sibstate) == CM_UNSAFE)
                return (B_FALSE);

        if (CS_MARK(sibstate) == CM_IDLE)
                return (B_TRUE);

        return (CS_ZONE(sibstate) == zoneid);
}

int
ht_acquire(void)
{
        clock_t wait = ht_acquire_wait_time;
        cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
        zoneid_t zoneid = getzoneid();
        cpu_ht_t *sibht;
        int ret = 0;

        ASSERT(!interrupts_enabled());

        if (ht->ch_sib == NULL) {
                /* For the "sequential" L1TF case. */
                spec_l1d_flush();
                return (1);
        }

        sibht = &ht->ch_sib->cpu_m.mcpu_ht;

        /* A VCPU thread should never change zone. */
        ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
        ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU);
        ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
        ASSERT3U(curthread->t_preempt, >=, 1);
        ASSERT(curthread->t_schedflag & TS_VCPU);

        while (ret == 0 && wait > 0) {
                if (yield_to_vcpu(ht->ch_sib, zoneid)) {
                        ret = -1;
                        break;
                }

                if (sibling_compatible(sibht, zoneid)) {
                        lock_set(&sibht->ch_lock);

                        if (sibling_compatible(sibht, zoneid)) {
                                ht->ch_state = CS_MK(CM_POISONED, zoneid);
                                sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid);
                                membar_enter();
                                ret = 1;
                        }

                        lock_clear(&sibht->ch_lock);
                } else {
                        drv_usecwait(10);
                        wait -= 10;
                }
        }

        DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state,
            uint64_t, sibht->ch_intr_depth, clock_t, wait);

        if (ret == 1)
                spec_l1d_flush();

        return (ret);
}

void
ht_release(void)
{
        cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
        zoneid_t zoneid = getzoneid();
        cpu_ht_t *sibht;

        ASSERT(!interrupts_enabled());

        if (ht->ch_sib == NULL)
                return;

        ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
        ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
        ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED);
        ASSERT3U(curthread->t_preempt, >=, 1);

        sibht = &ht->ch_sib->cpu_m.mcpu_ht;

        lock_set(&sibht->ch_lock);

        ht->ch_state = CS_MK(CM_VCPU, zoneid);
        sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid);
        membar_producer();

        lock_clear(&sibht->ch_lock);
}

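/*
 * Force our sibling out of guest mode: poke it, and if it remains poisoned by
 * an incompatible zone, drop our lock and spin until it has reached
 * ht_release() (or otherwise become compatible).  Called with our own ch_lock
 * held and interrupts disabled.
 */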
static void
ht_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
        uint64_t sibstate;

        ASSERT(LOCK_HELD(&ht->ch_lock));
        ASSERT(!interrupts_enabled());

        poke_cpu(ht->ch_sib->cpu_id);

        membar_consumer();
        sibstate = ht->ch_sibstate;

        if (CS_MARK(sibstate) != CM_POISONED || CS_ZONE(sibstate) == zoneid)
                return;

        lock_clear(&ht->ch_lock);

        /*
         * Spin until we can see the sibling has been kicked out or is
         * otherwise OK.
         */
        for (;;) {
                membar_consumer();
                sibstate = ht->ch_sibstate;

                if (CS_MARK(sibstate) != CM_POISONED ||
                    CS_ZONE(sibstate) == zoneid)
                        break;

                SMT_PAUSE();
        }

        lock_set(&ht->ch_lock);
}

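/*
 * Unless ht_intr_alloc_pil() has disabled the optimization, interrupts at
 * empty_pil (XC_CPUPOKE_PIL) are presumed to be only CPU pokes, which are
 * harmless and so don't require kicking a poisoned sibling.
 */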
static boolean_t
pil_needs_kick(uint_t pil)
{
        return (pil != empty_pil);
}

void
ht_begin_intr(uint_t pil)
{
        ulong_t flags;
        cpu_ht_t *ht;

        ASSERT(pil <= PIL_MAX);

        flags = intr_clear();
        ht = &CPU->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) {
                lock_set(&ht->ch_lock);

                membar_consumer();

                if (CS_MARK(ht->ch_sibstate) == CM_POISONED)
                        ht_kick(ht, GLOBAL_ZONEID);

                lock_clear(&ht->ch_lock);
        }

        intr_restore(flags);
}

void
ht_end_intr(void)
{
        ulong_t flags;
        cpu_ht_t *ht;

        flags = intr_clear();
        ht = &CPU->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        ASSERT3U(ht->ch_intr_depth, >, 0);
        atomic_dec_64(&ht->ch_intr_depth);

        intr_restore(flags);
}

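/*
 * Does our sibling need to be kicked?  Only if it's in guest mode, and either
 * we've just become unsafe ourselves or its guest belongs to a different zone.
 */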
static inline boolean_t
ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid)
{
        membar_consumer();

        if (CS_MARK(ht->ch_sibstate) != CM_POISONED)
                return (B_FALSE);

        if (CS_MARK(ht->ch_state) == CM_UNSAFE)
                return (B_TRUE);

        return (CS_ZONE(ht->ch_sibstate) != zoneid);
}

void
ht_mark(void)
{
        zoneid_t zoneid = getzoneid();
        kthread_t *t = curthread;
        ulong_t flags;
        cpu_ht_t *ht;
        cpu_t *cp;

        flags = intr_clear();

        cp = CPU;
        ht = &cp->cpu_m.mcpu_ht;

        if (ht->ch_sib == NULL) {
                intr_restore(flags);
                return;
        }

        lock_set(&ht->ch_lock);

        /*
         * If we were a nested interrupt and went through the
         * resume_from_intr() path, we may now be resuming to a pinning
         * interrupt thread; in that case, skip marking until we later resume
         * to a "real" thread.
         */
        if (ht->ch_intr_depth > 0) {
                ASSERT3P(t->t_intr, !=, NULL);

                if (ht_need_kick(ht, zoneid))
                        ht_kick(ht, zoneid);
                goto out;
        }

        if (t == t->t_cpu->cpu_idle_thread) {
                ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
                ht->ch_state = CS_MK(CM_IDLE, zoneid);
        } else {
                uint64_t state = CM_THREAD;

                if (t->t_unsafe)
                        state = CM_UNSAFE;
                else if (t->t_schedflag & TS_VCPU)
                        state = CM_VCPU;

                ht->ch_state = CS_MK(state, zoneid);

                if (ht_need_kick(ht, zoneid))
                        ht_kick(ht, zoneid);
        }

out:
        membar_producer();
        lock_clear(&ht->ch_lock);
        intr_restore(flags);
}

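/*
 * Bracket a stretch of code that handles potentially sensitive data, e.g.
 * (illustratively) an I/O path reaching into the networking or ZFS code:
 *
 *     ht_begin_unsafe();
 *     ... touch sensitive data ...
 *     ht_end_unsafe();
 *
 * While the counter is non-zero, ht_mark() will report this CPU as CM_UNSAFE,
 * so no guest may run on the sibling.
 */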
void
ht_begin_unsafe(void)
{
        curthread->t_unsafe++;
        ht_mark();
}

void
ht_end_unsafe(void)
{
        ASSERT3U(curthread->t_unsafe, >, 0);
        curthread->t_unsafe--;
        ht_mark();
}

void
ht_mark_as_vcpu(void)
{
        thread_lock(curthread);
        curthread->t_schedflag |= TS_VCPU;
        ht_mark();
        thread_unlock(curthread);
}

boolean_t
ht_should_run(kthread_t *t, cpu_t *cp)
{
        uint64_t sibstate;
        cpu_t *sib;

        if (t == t->t_cpu->cpu_idle_thread)
                return (B_TRUE);

        if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL)
                return (B_TRUE);

        sibstate = sib->cpu_m.mcpu_ht.ch_state;

        if ((t->t_schedflag & TS_VCPU)) {
                if (CS_MARK(sibstate) == CM_IDLE)
                        return (B_TRUE);
                if (CS_MARK(sibstate) == CM_UNSAFE)
                        return (B_FALSE);
                return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
        }

        if (CS_MARK(sibstate) < CM_VCPU)
                return (B_TRUE);

        return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
}

pri_t
ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
{
        if (ht_should_run(t, cp))
                return (score);

        /*
         * If we're a VCPU thread scoring our current CPU, we are most likely
         * asking to be rescheduled elsewhere after losing ht_acquire().  In
         * this case, the current CPU is not a good choice, and we should go
         * elsewhere.
         */
        if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
                return ((v.v_maxsyspri + 1) * 2);

        return (score + 1);
}