OS-7125 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>

          --- old/usr/src/uts/common/disp/disp.c
          +++ new/usr/src/uts/common/disp/disp.c
[ 15 lines elided ]
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
       26 +/*
       27 + * Copyright (c) 2018, Joyent, Inc. All rights reserved.
       28 + */
       29 +
  26   30  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27   31  /*        All Rights Reserved   */
  28   32  
  29   33  
  30   34  #include <sys/types.h>
  31   35  #include <sys/param.h>
  32   36  #include <sys/sysmacros.h>
  33   37  #include <sys/signal.h>
  34   38  #include <sys/user.h>
  35   39  #include <sys/systm.h>
[ 13 lines elided ]
  49   53  #include <sys/cpupart.h>
  50   54  #include <sys/lgrp.h>
  51   55  #include <sys/pg.h>
  52   56  #include <sys/cmt.h>
  53   57  #include <sys/bitset.h>
  54   58  #include <sys/schedctl.h>
  55   59  #include <sys/atomic.h>
  56   60  #include <sys/dtrace.h>
  57   61  #include <sys/sdt.h>
  58   62  #include <sys/archsystm.h>
       63 +#include <sys/ht.h>
  59   64  
  60   65  #include <vm/as.h>
  61   66  
  62   67  #define BOUND_CPU       0x1
  63   68  #define BOUND_PARTITION 0x2
  64   69  #define BOUND_INTR      0x4
  65   70  
  66   71  /* Dispatch queue allocation structure and functions */
  67   72  struct disp_queue_info {
  68   73          disp_t  *dp;
[ 1039 lines elided ]
1108 1113                  dtrace_vtime_switch(next);
1109 1114  
1110 1115          resume(next);
1111 1116          /*
1112 1117           * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1118           * appear at the end of resume(), because we may not
1114 1119           * return here
1115 1120           */
1116 1121  }
1117 1122  
1118      -#define CPU_IDLING(pri) ((pri) == -1)
1119      -
1120 1123  static void
1121 1124  cpu_resched(cpu_t *cp, pri_t tpri)
1122 1125  {
1123 1126          int     call_poke_cpu = 0;
1124 1127          pri_t   cpupri = cp->cpu_dispatch_pri;
1125 1128  
1126      -        if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
     1129 +        if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1127 1130                  TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1131                      "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1132                  if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1133                          cp->cpu_runrun = 1;
1131 1134                          aston(cp->cpu_dispthread);
1132 1135                          if (tpri < kpreemptpri && cp != CPU)
1133 1136                                  call_poke_cpu = 1;
1134 1137                  }
1135 1138                  if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1139                          cp->cpu_kprunrun = 1;
[ 75 lines elided ]
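
Note on the cpu_resched() change above: the deleted CPU_IDLING() macro tested for the sentinel priority -1, and the new code compares against a named constant instead. A minimal sketch of the assumed definition (presumably added elsewhere in this patch, e.g. in sys/cpuvar.h, which is not part of this file):

    /* Assumed definition; placement and comment are hypothetical. */
    #define CPU_IDLE_PRI    (-1)    /* dispatch priority of an idling CPU */

With that definition, the new cpupri != CPU_IDLE_PRI test is equivalent to the old !CPU_IDLING(cpupri).
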
1212 1215                  cp = tp->t_cpu;
1213 1216          else if (!bound) {
1214 1217                  if (tpri >= kpqpri) {
1215 1218                          setkpdq(tp, SETKP_BACK);
1216 1219                          return;
1217 1220                  }
1218 1221  
1219 1222                  /*
1220 1223                   * We'll generally let this thread continue to run where
1221 1224                   * it last ran...but will consider migration if:
1222      -                 * - We thread probably doesn't have much cache warmth.
     1225 +                 * - The thread probably doesn't have much cache warmth.
     1226 +                 * - HT exclusion would prefer us to run elsewhere
1223 1227                   * - The CPU where it last ran is the target of an offline
1224 1228                   *   request.
1225      -                 * - The thread last ran outside it's home lgroup.
     1229 +                 * - The thread last ran outside its home lgroup.
1226 1230                   */
1227 1231                  if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228      -                    (tp->t_cpu == cpu_inmotion)) {
1229      -                        cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230      -                } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231      -                        cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232      -                            self ? tp->t_cpu : NULL);
     1232 +                    !ht_should_run(tp, tp->t_cpu) ||
     1233 +                    (tp->t_cpu == cpu_inmotion) ||
     1234 +                    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
     1235 +                        cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1233 1236                  } else {
1234 1237                          cp = tp->t_cpu;
1235 1238                  }
1236 1239  
1237 1240                  if (tp->t_cpupart == cp->cpu_part) {
1238 1241                          int     qlen;
1239 1242  
1240 1243                          /*
1241 1244                           * Perform any CMT load balancing
1242 1245                           */
[ 8 lines elided ]
1251 1254                                  qlen -= RUNQ_MAX_DIFF;
1252 1255                          if (qlen > 0) {
1253 1256                                  cpu_t *newcp;
1254 1257  
1255 1258                                  if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1259                                          newcp = cp->cpu_next_part;
1257 1260                                  } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1261                                          newcp = cp->cpu_next_part;
1259 1262                                  }
1260 1263  
1261      -                                if (RUNQ_LEN(newcp, tpri) < qlen) {
     1264 +                                if (ht_should_run(tp, newcp) &&
     1265 +                                    RUNQ_LEN(newcp, tpri) < qlen) {
1262 1266                                          DTRACE_PROBE3(runq__balance,
1263 1267                                              kthread_t *, tp,
1264 1268                                              cpu_t *, cp, cpu_t *, newcp);
1265 1269                                          cp = newcp;
1266 1270                                  }
1267 1271                          }
1268 1272                  } else {
1269 1273                          /*
1270 1274                           * Migrate to a cpu in the new partition.
1271 1275                           */
1272      -                        cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273      -                            tp->t_lpl, tp->t_pri, NULL);
     1276 +                        cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
     1277 +                            tp->t_pri);
1274 1278                  }
1275 1279                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1280          } else {
1277 1281                  /*
1278 1282                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1283                   * a short time until weak binding that existed when the
1280 1284                   * strong binding was established has dropped) so we must
1281 1285                   * favour weak binding over strong.
1282 1286                   */
1283 1287                  cp = tp->t_weakbound_cpu ?
[ 116 lines elided ]
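
The setbackdq() hunk above adds ht_should_run() to the list of reasons to consider migration: for the L1TF mitigation, a thread should avoid a CPU whose hyperthread sibling is committed to an incompatible security context. The real implementation lives behind the new sys/ht.h include; the following is a conceptual sketch only, with sibling_of(), cpu_constraint() and thread_constraint() as hypothetical stand-ins for the patch's actual bookkeeping:

    /*
     * Conceptual sketch, not the actual HT code.  sibling_of(),
     * cpu_constraint() and thread_constraint() are hypothetical helpers.
     */
    boolean_t
    example_ht_should_run(kthread_t *t, cpu_t *cp)
    {
            cpu_t *sib = sibling_of(cp);

            if (sib == NULL)                /* no HT sibling to conflict with */
                    return (B_TRUE);

            if (cpu_constraint(sib) == NULL)        /* sibling uncommitted */
                    return (B_TRUE);

            /* Only run here if we share the sibling's current constraint. */
            return (cpu_constraint(sib) == thread_constraint(t));
    }
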
1400 1404          else if (!bound) {
1401 1405                  if (tpri >= kpqpri) {
1402 1406                          setkpdq(tp, SETKP_FRONT);
1403 1407                          return;
1404 1408                  }
1405 1409                  cp = tp->t_cpu;
1406 1410                  if (tp->t_cpupart == cp->cpu_part) {
1407 1411                          /*
1408 1412                           * We'll generally let this thread continue to run
1409 1413                           * where it last ran, but will consider migration if:
1410      -                         * - The thread last ran outside it's home lgroup.
     1414 +                         * - The thread last ran outside its home lgroup.
1411 1415                           * - The CPU where it last ran is the target of an
1412 1416                           *   offline request (a thread_nomigrate() on the in
1413 1417                           *   motion CPU relies on this when forcing a preempt).
1414 1418                           * - The thread isn't the highest priority thread where
1415 1419                           *   it last ran, and it is considered not likely to
1416 1420                           *   have significant cache warmth.
1417 1421                           */
1418      -                        if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419      -                            (cp == cpu_inmotion)) {
1420      -                                cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421      -                                    (tp == curthread) ? cp : NULL);
1422      -                        } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423      -                            (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424      -                                cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425      -                                    NULL);
     1422 +                        if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
     1423 +                            cp == cpu_inmotion ||
     1424 +                            (tpri < cp->cpu_disp->disp_maxrunpri &&
     1425 +                            !THREAD_HAS_CACHE_WARMTH(tp))) {
     1426 +                                cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1426 1427                          }
1427 1428                  } else {
1428 1429                          /*
1429 1430                           * Migrate to a cpu in the new partition.
1430 1431                           */
1431 1432                          cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432      -                            tp->t_lpl, tp->t_pri, NULL);
     1433 +                            tp, tp->t_pri);
1433 1434                  }
1434 1435                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1436          } else {
1436 1437                  /*
1437 1438                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1439                   * a short time until weak binding that existed when the
1439 1440                   * strong binding was established has dropped) so we must
1440 1441                   * favour weak binding over strong.
1441 1442                   */
1442 1443                  cp = tp->t_weakbound_cpu ?
[ 130 lines elided ]
1573 1574                          dp->disp_maxrunpri = tpri;
1574 1575                          membar_enter();
1575 1576                  }
1576 1577          }
1577 1578  
1578 1579          cp = tp->t_cpu;
1579 1580          if (tp->t_cpupart != cp->cpu_part) {
1580 1581                  /* migrate to a cpu in the new partition */
1581 1582                  cp = tp->t_cpupart->cp_cpulist;
1582 1583          }
1583      -        cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
     1584 +        cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1584 1585          disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1586          ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1587  
1587 1588  #ifndef NPROBE
1588 1589          /* Kernel probe */
1589 1590          if (tnf_tracing_active)
1590 1591                  tnf_thread_queue(tp, cp, tpri);
1591 1592  #endif /* NPROBE */
1592 1593  
1593 1594          if (cp->cpu_chosen_level < tpri)
[ 952 lines elided ]
2546 2547                   */
2547 2548                  ASSERT(tp->t_cpu != cp);
2548 2549                  thread_unlock(tp);
2549 2550  
2550 2551                  disp_lock_enter(&dp->disp_lock);
2551 2552          }
2552 2553          disp_lock_exit(&dp->disp_lock);
2553 2554  }
2554 2555  
2555 2556  /*
2556      - * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557      - *      The hint passed in is used as a starting point so we don't favor
2558      - *      CPU 0 or any other CPU.  The caller should pass in the most recently
2559      - *      used CPU for the thread.
     2557 + * Return a score rating this CPU for running this thread: lower is better.
2560 2558   *
2561      - *      The lgroup and priority are used to determine the best CPU to run on
2562      - *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563      - *      the thread priority will indicate whether the thread will actually run
2564      - *      there.  To pick the best CPU, the CPUs inside and outside of the given
2565      - *      lgroup which are running the lowest priority threads are found.  The
2566      - *      remote CPU is chosen only if the thread will not run locally on a CPU
2567      - *      within the lgroup, but will run on the remote CPU. If the thread
2568      - *      cannot immediately run on any CPU, the best local CPU will be chosen.
     2559 + * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
     2560 + * curcpu (as that's our own priority).
2569 2561   *
2570      - *      The lpl specified also identifies the cpu partition from which
2571      - *      disp_lowpri_cpu should select a CPU.
     2562 + * If a cpu is the target of an offline request, then try to avoid it.
2572 2563   *
2573      - *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2574      - *      behalf of the current thread. (curthread is looking for a new cpu)
2575      - *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576      - *      ignored.
     2564 + * Otherwise we'll use double the effective dispatcher priority for the CPU.
2577 2565   *
2578      - *      If a cpu is the target of an offline request then try to avoid it.
     2566 + * We do this so ht_adjust_cpu_score() can increment the score if needed,
     2567 + * without ending up overriding a dispatcher priority.
     2568 + */
     2569 +static pri_t
     2570 +cpu_score(cpu_t *cp, kthread_t *tp)
     2571 +{
     2572 +        pri_t score;
     2573 +
     2574 +        if (tp == curthread && cp == curthread->t_cpu)
     2575 +                score = 2 * CPU_IDLE_PRI;
     2576 +        else if (cp == cpu_inmotion)
     2577 +                score = SHRT_MAX;
     2578 +        else
     2579 +                score = 2 * cp->cpu_dispatch_pri;
     2580 +
     2581 +        if (2 * cp->cpu_disp->disp_maxrunpri > score)
     2582 +                score = 2 * cp->cpu_disp->disp_maxrunpri;
     2583 +        if (2 * cp->cpu_chosen_level > score)
     2584 +                score = 2 * cp->cpu_chosen_level;
     2585 +
     2586 +        return (ht_adjust_cpu_score(tp, cp, score));
     2587 +}
     2588 +
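
A worked example of why the score is doubled: a CPU whose effective dispatch priority is 59 scores 118. If ht_adjust_cpu_score() adds a penalty of 1 (an assumption about its behaviour; the real adjustment is defined elsewhere in this patch), the CPU scores 119, so it sorts behind an unpenalized CPU at the same priority (118) but still ahead of any CPU at priority 60 (120): the HT penalty breaks ties without outranking a genuinely lower dispatch priority. A conceptual sketch of such an adjustment, with ht_sibling_conflicts() as a hypothetical predicate:

    /*
     * Conceptual sketch only; the real ht_adjust_cpu_score() is part of
     * the new HT code, not this file.  ht_sibling_conflicts() is a
     * hypothetical stand-in for its exclusion check.
     */
    static pri_t
    example_adjust_cpu_score(kthread_t *tp, cpu_t *cp, pri_t score)
    {
            if (score == SHRT_MAX)          /* e.g. cpu_inmotion: leave it */
                    return (score);

            if (ht_sibling_conflicts(tp, cp))
                    score++;                /* half of one priority level */

            return (score);
    }
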
     2589 +/*
     2590 + * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2579 2591   *
2580      - *      This function must be called at either high SPL, or with preemption
2581      - *      disabled, so that the "hint" CPU cannot be removed from the online
2582      - *      CPU list while we are traversing it.
     2592 + * We are looking for a CPU with an effective dispatch priority lower than the
     2593 + * thread's, so that the thread will run immediately rather than be enqueued.
     2594 + * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
     2595 + * If we don't find an available CPU there, we will expand our search to include
     2596 + * wider locality levels. (Note these groups are already divided by CPU
     2597 + * partition.)
     2598 + *
     2599 + * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
     2600 + * the best home CPU we found.
     2601 + *
     2602 + * The hint passed in is used as a starting point so we don't favor CPU 0 or any
     2603 + * other CPU.  The caller should pass in the most recently used CPU for the
     2604 + * thread; it's of course possible that this CPU isn't in the home lgroup.
     2605 + *
     2606 + * This function must be called at either high SPL, or with preemption disabled,
     2607 + * so that the "hint" CPU cannot be removed from the online CPU list while we
     2608 + * are traversing it.
2583 2609   */
2584 2610  cpu_t *
2585      -disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
     2611 +disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2586 2612  {
2587 2613          cpu_t   *bestcpu;
2588 2614          cpu_t   *besthomecpu;
2589 2615          cpu_t   *cp, *cpstart;
2590 2616  
2591      -        pri_t   bestpri;
2592      -        pri_t   cpupri;
2593      -
2594 2617          klgrpset_t      done;
2595      -        klgrpset_t      cur_set;
2596 2618  
2597 2619          lpl_t           *lpl_iter, *lpl_leaf;
2598      -        int             i;
2599 2620  
2600      -        /*
2601      -         * Scan for a CPU currently running the lowest priority thread.
2602      -         * Cannot get cpu_lock here because it is adaptive.
2603      -         * We do not require lock on CPU list.
2604      -         */
2605 2621          ASSERT(hint != NULL);
2606      -        ASSERT(lpl != NULL);
2607      -        ASSERT(lpl->lpl_ncpu > 0);
     2622 +        ASSERT(tp->t_lpl->lpl_ncpu > 0);
2608 2623  
2609      -        /*
2610      -         * First examine local CPUs. Note that it's possible the hint CPU
2611      -         * passed in in remote to the specified home lgroup. If our priority
2612      -         * isn't sufficient enough such that we can run immediately at home,
2613      -         * then examine CPUs remote to our home lgroup.
2614      -         * We would like to give preference to CPUs closest to "home".
2615      -         * If we can't find a CPU where we'll run at a given level
2616      -         * of locality, we expand our search to include the next level.
2617      -         */
2618 2624          bestcpu = besthomecpu = NULL;
2619 2625          klgrpset_clear(done);
2620      -        /* start with lpl we were passed */
2621 2626  
2622      -        lpl_iter = lpl;
     2627 +        lpl_iter = tp->t_lpl;
2623 2628  
2624 2629          do {
     2630 +                pri_t best = SHRT_MAX;
     2631 +                klgrpset_t cur_set;
2625 2632  
2626      -                bestpri = SHRT_MAX;
2627 2633                  klgrpset_clear(cur_set);
2628 2634  
2629      -                for (i = 0; i < lpl_iter->lpl_nrset; i++) {
     2635 +                for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2636                          lpl_leaf = lpl_iter->lpl_rset[i];
2631 2637                          if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2638                                  continue;
2633 2639  
2634 2640                          klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2641  
2636 2642                          if (hint->cpu_lpl == lpl_leaf)
2637 2643                                  cp = cpstart = hint;
2638 2644                          else
2639 2645                                  cp = cpstart = lpl_leaf->lpl_cpus;
2640 2646  
2641 2647                          do {
2642      -                                if (cp == curcpu)
2643      -                                        cpupri = -1;
2644      -                                else if (cp == cpu_inmotion)
2645      -                                        cpupri = SHRT_MAX;
2646      -                                else
2647      -                                        cpupri = cp->cpu_dispatch_pri;
2648      -                                if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649      -                                        cpupri = cp->cpu_disp->disp_maxrunpri;
2650      -                                if (cp->cpu_chosen_level > cpupri)
2651      -                                        cpupri = cp->cpu_chosen_level;
2652      -                                if (cpupri < bestpri) {
2653      -                                        if (CPU_IDLING(cpupri)) {
2654      -                                                ASSERT((cp->cpu_flags &
2655      -                                                    CPU_QUIESCED) == 0);
2656      -                                                return (cp);
2657      -                                        }
     2648 +                                pri_t score = cpu_score(cp, tp);
     2649 +
     2650 +                                if (score < best) {
     2651 +                                        best = score;
2658 2652                                          bestcpu = cp;
2659      -                                        bestpri = cpupri;
     2653 +
     2654 +                                        /* An idle CPU: we're done. */
     2655 +                                        if (score / 2 == CPU_IDLE_PRI)
     2656 +                                                goto out;
2660 2657                                  }
2661 2658                          } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2659                  }
2663 2660  
2664      -                if (bestcpu && (tpri > bestpri)) {
2665      -                        ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666      -                        return (bestcpu);
2667      -                }
     2661 +                if (bestcpu != NULL && tpri > (best / 2))
     2662 +                        goto out;
     2663 +
2668 2664                  if (besthomecpu == NULL)
2669 2665                          besthomecpu = bestcpu;
     2666 +
2670 2667                  /*
2671 2668                   * Add the lgrps we just considered to the "done" set
2672 2669                   */
2673 2670                  klgrpset_or(done, cur_set);
2674 2671  
2675 2672          } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2673  
2677 2674          /*
2678 2675           * The specified priority isn't high enough to run immediately
2679 2676           * anywhere, so just return the best CPU from the home lgroup.
2680 2677           */
2681      -        ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682      -        return (besthomecpu);
     2678 +        bestcpu = besthomecpu;
     2679 +
     2680 +out:
     2681 +        ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
     2682 +        return (bestcpu);
2683 2683  }
2684 2684  
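
For reference, the call-site change implied by the new signature, taken verbatim from the hunks above: the lpl and curcpu parameters are gone, since cpu_score() now derives the home lgroup from tp->t_lpl and detects the curthread case itself.

    /* Old interface: */
    cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);

    /* New interface (this patch): */
    cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
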
2685 2685  /*
2686 2686   * This routine provides the generic idle cpu function for all processors.
2687 2687   * If a processor has some specific code to execute when idle (say, to stop
2688 2688   * the pipeline and save power) then that routine should be defined in the
2689 2689   * processors specific code (module_xx.c) and the global variable idle_cpu
2690 2690   * set to that function.
2691 2691   */
2692 2692  static void
2693 2693  generic_idle_cpu(void)
2694 2694  {
2695 2695  }
2696 2696  
2697 2697  /*ARGSUSED*/
2698 2698  static void
2699 2699  generic_enq_thread(cpu_t *cpu, int bound)
2700 2700  {
     2701 +}
     2702 +
     2703 +cpu_t *
     2704 +disp_choose_best_cpu(void)
     2705 +{
     2706 +        kthread_t *t = curthread;
     2707 +        cpu_t *curcpu = CPU;
     2708 +
     2709 +        ASSERT(t->t_preempt > 0);
     2710 +        ASSERT(t->t_state == TS_ONPROC);
     2711 +        ASSERT(t->t_schedflag & TS_VCPU);
     2712 +
     2713 +        if (ht_should_run(t, curcpu))
     2714 +                return (curcpu);
     2715 +
     2716 +        return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2701 2717  }
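
A hedged usage sketch for the new disp_choose_best_cpu(): per its ASSERTs, the caller must be an on-CPU VCPU thread (TS_VCPU) running with preemption disabled. The actual caller is elsewhere in this patch and not shown here; what to do when a better CPU is returned is caller-specific.

    /*
     * Illustrative only: shows the calling convention the ASSERTs imply,
     * assuming curthread is a VCPU thread.
     */
    cpu_t *cp;

    kpreempt_disable();
    cp = disp_choose_best_cpu();
    if (cp != CPU) {
            /* the current CPU fails ht_should_run(); prefer to move to cp */
    }
    kpreempt_enable();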
    