10924 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Peter Tribble <peter.tribble@gmail.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/var.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/inline.h>
  42 #include <sys/disp.h>
  43 #include <sys/class.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/kmem.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/tnf.h>
  49 #include <sys/cpupart.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/pg.h>
  52 #include <sys/cmt.h>
  53 #include <sys/bitset.h>
  54 #include <sys/schedctl.h>
  55 #include <sys/atomic.h>
  56 #include <sys/dtrace.h>
  57 #include <sys/sdt.h>
  58 #include <sys/archsystm.h>
  59 
  60 #include <vm/as.h>
  61 
  62 #define BOUND_CPU       0x1
  63 #define BOUND_PARTITION 0x2
  64 #define BOUND_INTR      0x4
  65 
  66 /* Dispatch queue allocation structure and functions */
  67 struct disp_queue_info {
  68         disp_t  *dp;
  69         dispq_t *olddispq;
  70         dispq_t *newdispq;
  71         ulong_t *olddqactmap;
  72         ulong_t *newdqactmap;
  73         int     oldnglobpris;
  74 };
  75 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76     disp_t *dp);
  77 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78 static void     disp_dq_free(struct disp_queue_info *dptr);


1098          * queue.
1099          */
1100         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101                 curthread->t_waitrq = now;
1102         }
1103 
1104         /* restore next thread to previously running microstate */
1105         restore_mstate(next);
1106 
1107         if (dtrace_vtime_active)
1108                 dtrace_vtime_switch(next);
1109 
1110         resume(next);
1111         /*
1112          * The TR_RESUME_END and TR_SWTCH_END trace points
1113          * appear at the end of resume(), because we may not
1114          * return here
1115          */
1116 }
1117 
1118 #define CPU_IDLING(pri) ((pri) == -1)
1119 
1120 static void
1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 {
1123         int     call_poke_cpu = 0;
1124         pri_t   cpupri = cp->cpu_dispatch_pri;
1125 
1126         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130                         cp->cpu_runrun = 1;
1131                         aston(cp->cpu_dispthread);
1132                         if (tpri < kpreemptpri && cp != CPU)
1133                                 call_poke_cpu = 1;
1134                 }
1135                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136                         cp->cpu_kprunrun = 1;
1137                         if (cp != CPU)
1138                                 call_poke_cpu = 1;
1139                 }
1140         }
1141 
1142         /*
1143          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144          */
1145         membar_enter();
1146 


1202 
1203         self = (tp == curthread);
1204 
1205         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206                 bound = 1;
1207         else
1208                 bound = 0;
1209 
1210         tpri = DISP_PRIO(tp);
1211         if (ncpus == 1)
1212                 cp = tp->t_cpu;
1213         else if (!bound) {
1214                 if (tpri >= kpqpri) {
1215                         setkpdq(tp, SETKP_BACK);
1216                         return;
1217                 }
1218 
1219                 /*
1220                  * We'll generally let this thread continue to run where
1221                  * it last ran...but will consider migration if:
1222                  * - We thread probably doesn't have much cache warmth.
1223                  * - The CPU where it last ran is the target of an offline
1224                  *   request.
1225                  * - The thread last ran outside it's home lgroup.
1226                  */
1227                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228                     (tp->t_cpu == cpu_inmotion)) {
1229                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230                 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232                             self ? tp->t_cpu : NULL);
1233                 } else {
1234                         cp = tp->t_cpu;
1235                 }
1236 
1237                 if (tp->t_cpupart == cp->cpu_part) {
1238                         int     qlen;
1239 
1240                         /*
1241                          * Perform any CMT load balancing
1242                          */
1243                         cp = cmt_balance(tp, cp);
1244 
1245                         /*
1246                          * Balance across the run queues
1247                          */
1248                         qlen = RUNQ_LEN(cp, tpri);
1249                         if (tpri >= RUNQ_MATCH_PRI &&
1250                             !(tp->t_schedflag & TS_RUNQMATCH))
1251                                 qlen -= RUNQ_MAX_DIFF;
1252                         if (qlen > 0) {
1253                                 cpu_t *newcp;
1254 
1255                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256                                         newcp = cp->cpu_next_part;
1257                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258                                         newcp = cp->cpu_next_part;
1259                                 }
1260 
1261                                 if (RUNQ_LEN(newcp, tpri) < qlen) {
1262                                         DTRACE_PROBE3(runq__balance,
1263                                             kthread_t *, tp,
1264                                             cpu_t *, cp, cpu_t *, newcp);
1265                                         cp = newcp;
1266                                 }
1267                         }
1268                 } else {
1269                         /*
1270                          * Migrate to a cpu in the new partition.
1271                          */
1272                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273                             tp->t_lpl, tp->t_pri, NULL);
1274                 }
1275                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276         } else {
1277                 /*
1278                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279                  * a short time until weak binding that existed when the
1280                  * strong binding was established has dropped) so we must
1281                  * favour weak binding over strong.
1282                  */
1283                 cp = tp->t_weakbound_cpu ?
1284                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1285         }
1286         /*
1287          * A thread that is ONPROC may be temporarily placed on the run queue
1288          * but then chosen to run again by disp.  If the thread we're placing on
1289          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290          * replacement process is actually scheduled in swtch().  In this
1291          * situation, curthread is the only thread that could be in the ONPROC
1292          * state.
1293          */


1390         }
1391 
1392         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393                 bound = 1;
1394         else
1395                 bound = 0;
1396 
1397         tpri = DISP_PRIO(tp);
1398         if (ncpus == 1)
1399                 cp = tp->t_cpu;
1400         else if (!bound) {
1401                 if (tpri >= kpqpri) {
1402                         setkpdq(tp, SETKP_FRONT);
1403                         return;
1404                 }
1405                 cp = tp->t_cpu;
1406                 if (tp->t_cpupart == cp->cpu_part) {
1407                         /*
1408                          * We'll generally let this thread continue to run
1409                          * where it last ran, but will consider migration if:
1410                          * - The thread last ran outside it's home lgroup.
1411                          * - The CPU where it last ran is the target of an
1412                          *   offline request (a thread_nomigrate() on the in
1413                          *   motion CPU relies on this when forcing a preempt).
1414                          * - The thread isn't the highest priority thread where
1415                          *   it last ran, and it is considered not likely to
1416                          *   have significant cache warmth.
1417                          */
1418                         if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419                             (cp == cpu_inmotion)) {
1420                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421                                     (tp == curthread) ? cp : NULL);
1422                         } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423                             (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425                                     NULL);
1426                         }
1427                 } else {
1428                         /*
1429                          * Migrate to a cpu in the new partition.
1430                          */
1431                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432                             tp->t_lpl, tp->t_pri, NULL);
1433                 }
1434                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435         } else {
1436                 /*
1437                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438                  * a short time until weak binding that existed when the
1439                  * strong binding was established has dropped) so we must
1440                  * favour weak binding over strong.
1441                  */
1442                 cp = tp->t_weakbound_cpu ?
1443                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1444         }
1445 
1446         /*
1447          * A thread that is ONPROC may be temporarily placed on the run queue
1448          * but then chosen to run again by disp.  If the thread we're placing on
1449          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450          * replacement process is actually scheduled in swtch().  In this
1451          * situation, curthread is the only thread that could be in the ONPROC
1452          * state.


1563                 } else {
1564                         ASSERT(dq->dq_last == NULL);
1565                         ASSERT(dq->dq_first == NULL);
1566                         tp->t_link = NULL;
1567                         dq->dq_first = dq->dq_last = tp;
1568                 }
1569                 BT_SET(dp->disp_qactmap, tpri);
1570                 if (tpri > dp->disp_max_unbound_pri)
1571                         dp->disp_max_unbound_pri = tpri;
1572                 if (tpri > dp->disp_maxrunpri) {
1573                         dp->disp_maxrunpri = tpri;
1574                         membar_enter();
1575                 }
1576         }
1577 
1578         cp = tp->t_cpu;
1579         if (tp->t_cpupart != cp->cpu_part) {
1580                 /* migrate to a cpu in the new partition */
1581                 cp = tp->t_cpupart->cp_cpulist;
1582         }
1583         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 
1587 #ifndef NPROBE
1588         /* Kernel probe */
1589         if (tnf_tracing_active)
1590                 tnf_thread_queue(tp, cp, tpri);
1591 #endif /* NPROBE */
1592 
1593         if (cp->cpu_chosen_level < tpri)
1594                 cp->cpu_chosen_level = tpri;
1595         cpu_resched(cp, tpri);
1596         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597         (*disp_enq_thread)(cp, 0);
1598 }
1599 
1600 /*
1601  * Remove a thread from the dispatcher queue if it is on it.
1602  * It is not an error if it is not found but we return whether
1603  * or not it was found in case the caller wants to check.


2536                  *
2537                  * cp has already been removed from the list of active cpus
2538                  * and tp->t_cpu has been changed so there is no risk of
2539                  * tp ending up back on cp.
2540                  *
2541                  * Called from cpupart_move_cpu:
2542                  *
2543                  * The cpu has moved to a new cpupart.  Any threads that
2544                  * were on its dispatch queues before the move remain
2545                  * in the old partition and can't run in the new partition.
2546                  */
2547                 ASSERT(tp->t_cpu != cp);
2548                 thread_unlock(tp);
2549 
2550                 disp_lock_enter(&dp->disp_lock);
2551         }
2552         disp_lock_exit(&dp->disp_lock);
2553 }
2554 
2555 /*
2556  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557  *      The hint passed in is used as a starting point so we don't favor
2558  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2559  *      used CPU for the thread.
2560  *
2561  *      The lgroup and priority are used to determine the best CPU to run on
2562  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563  *      the thread priority will indicate whether the thread will actually run
2564  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2565  *      lgroup which are running the lowest priority threads are found.  The
2566  *      remote CPU is chosen only if the thread will not run locally on a CPU
2567  *      within the lgroup, but will run on the remote CPU. If the thread
2568  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2569  *
2570  *      The lpl specified also identifies the cpu partition from which
2571  *      disp_lowpri_cpu should select a CPU.
2572  *
2573  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2574  *      behalf of the current thread. (curthread is looking for a new cpu)
2575  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576  *      ignored.
2577  *
2578  *      If a cpu is the target of an offline request then try to avoid it.

2579  *
2580  *      This function must be called at either high SPL, or with preemption
2581  *      disabled, so that the "hint" CPU cannot be removed from the online
2582  *      CPU list while we are traversing it.
2583  */
2584 cpu_t *
2585 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 {
2587         cpu_t   *bestcpu;
2588         cpu_t   *besthomecpu;
2589         cpu_t   *cp, *cpstart;
2590 
2591         pri_t   bestpri;
2592         pri_t   cpupri;
2593 
2594         klgrpset_t      done;
2595         klgrpset_t      cur_set;
2596 
2597         lpl_t           *lpl_iter, *lpl_leaf;
2598         int             i;
2599 
2600         /*
2601          * Scan for a CPU currently running the lowest priority thread.
2602          * Cannot get cpu_lock here because it is adaptive.
2603          * We do not require lock on CPU list.
2604          */
2605         ASSERT(hint != NULL);
2606         ASSERT(lpl != NULL);
2607         ASSERT(lpl->lpl_ncpu > 0);
2608 
2609         /*
2610          * First examine local CPUs. Note that it's possible the hint CPU
2611          * passed in in remote to the specified home lgroup. If our priority
2612          * isn't sufficient enough such that we can run immediately at home,
2613          * then examine CPUs remote to our home lgroup.
2614          * We would like to give preference to CPUs closest to "home".
2615          * If we can't find a CPU where we'll run at a given level
2616          * of locality, we expand our search to include the next level.
2617          */
2618         bestcpu = besthomecpu = NULL;
2619         klgrpset_clear(done);
2620         /* start with lpl we were passed */
2621 
2622         lpl_iter = lpl;
2623 
2624         do {
2625 
2626                 bestpri = SHRT_MAX;
2627                 klgrpset_clear(cur_set);
2628 
2629                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630                         lpl_leaf = lpl_iter->lpl_rset[i];
2631                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632                                 continue;
2633 
2634                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 
2636                         if (hint->cpu_lpl == lpl_leaf)
2637                                 cp = cpstart = hint;
2638                         else
2639                                 cp = cpstart = lpl_leaf->lpl_cpus;
2640 
2641                         do {
2642                                 if (cp == curcpu)
2643                                         cpupri = -1;
2644                                 else if (cp == cpu_inmotion)
2645                                         cpupri = SHRT_MAX;
2646                                 else
2647                                         cpupri = cp->cpu_dispatch_pri;
2648                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2650                                 if (cp->cpu_chosen_level > cpupri)
2651                                         cpupri = cp->cpu_chosen_level;
2652                                 if (cpupri < bestpri) {
2653                                         if (CPU_IDLING(cpupri)) {
2654                                                 ASSERT((cp->cpu_flags &
2655                                                     CPU_QUIESCED) == 0);
2656                                                 return (cp);
2657                                         }
2658                                         bestcpu = cp;
2659                                         bestpri = cpupri;
2660                                 }
2661                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2662                 }
2663 
2664                 if (bestcpu && (tpri > bestpri)) {
2665                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666                         return (bestcpu);
2667                 }
2668                 if (besthomecpu == NULL)
2669                         besthomecpu = bestcpu;
2670                 /*
2671                  * Add the lgrps we just considered to the "done" set
2672                  */
2673                 klgrpset_or(done, cur_set);
2674 
2675         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 
2677         /*
2678          * The specified priority isn't high enough to run immediately
2679          * anywhere, so just return the best CPU from the home lgroup.
2680          */
2681         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682         return (besthomecpu);
2683 }
2684 
2685 /*
2686  * This routine provides the generic idle cpu function for all processors.
2687  * If a processor has some specific code to execute when idle (say, to stop
2688  * the pipeline and save power) then that routine should be defined in the
2689  * processors specific code (module_xx.c) and the global variable idle_cpu
2690  * set to that function.
2691  */
2692 static void
2693 generic_idle_cpu(void)
2694 {
2695 }
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }

The listing above shows the file before this change; the listing below shows the same file with the change applied.

   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2018, Joyent, Inc. All rights reserved.
  28  */
  29 
  30 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  31 /*        All Rights Reserved   */
  32 
  33 
  34 #include <sys/types.h>
  35 #include <sys/param.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/signal.h>
  38 #include <sys/user.h>
  39 #include <sys/systm.h>
  40 #include <sys/sysinfo.h>
  41 #include <sys/var.h>
  42 #include <sys/errno.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/debug.h>
  45 #include <sys/inline.h>
  46 #include <sys/disp.h>
  47 #include <sys/class.h>
  48 #include <sys/bitmap.h>
  49 #include <sys/kmem.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/tnf.h>
  53 #include <sys/cpupart.h>
  54 #include <sys/lgrp.h>
  55 #include <sys/pg.h>
  56 #include <sys/cmt.h>
  57 #include <sys/bitset.h>
  58 #include <sys/schedctl.h>
  59 #include <sys/atomic.h>
  60 #include <sys/dtrace.h>
  61 #include <sys/sdt.h>
  62 #include <sys/archsystm.h>
  63 #include <sys/ht.h>
  64 
  65 #include <vm/as.h>
  66 
  67 #define BOUND_CPU       0x1
  68 #define BOUND_PARTITION 0x2
  69 #define BOUND_INTR      0x4
  70 
  71 /* Dispatch queue allocation structure and functions */
  72 struct disp_queue_info {
  73         disp_t  *dp;
  74         dispq_t *olddispq;
  75         dispq_t *newdispq;
  76         ulong_t *olddqactmap;
  77         ulong_t *newdqactmap;
  78         int     oldnglobpris;
  79 };
  80 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  81     disp_t *dp);
  82 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  83 static void     disp_dq_free(struct disp_queue_info *dptr);


1103          * queue.
1104          */
1105         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1106                 curthread->t_waitrq = now;
1107         }
1108 
1109         /* restore next thread to previously running microstate */
1110         restore_mstate(next);
1111 
1112         if (dtrace_vtime_active)
1113                 dtrace_vtime_switch(next);
1114 
1115         resume(next);
1116         /*
1117          * The TR_RESUME_END and TR_SWTCH_END trace points
1118          * appear at the end of resume(), because we may not
1119          * return here
1120          */
1121 }
1122 
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126         int     call_poke_cpu = 0;
1127         pri_t   cpupri = cp->cpu_dispatch_pri;
1128 
1129         if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1130                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133                         cp->cpu_runrun = 1;
1134                         aston(cp->cpu_dispthread);
1135                         if (tpri < kpreemptpri && cp != CPU)
1136                                 call_poke_cpu = 1;
1137                 }
1138                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139                         cp->cpu_kprunrun = 1;
1140                         if (cp != CPU)
1141                                 call_poke_cpu = 1;
1142                 }
1143         }
1144 
1145         /*
1146          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1147          */
1148         membar_enter();
1149 
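
A note on the idle test above: the pre-change file used a local CPU_IDLING(pri) macro (old line 1118), which is gone here; the comparison in cpu_resched() above, and cpu_score() later in this listing, use CPU_IDLE_PRI instead. That constant is not defined in this excerpt; a minimal sketch of the assumed definition, consistent with the old macro, is:

        /* Assumed to live in a shared dispatcher header; not shown in this excerpt. */
        #define CPU_IDLE_PRI    (-1)    /* cpu_dispatch_pri of a CPU with nothing to run */

        /* Under that assumption the old and new idle tests are equivalent:           */
        /*      CPU_IDLING(cpupri)      <=>     (cpupri == CPU_IDLE_PRI)              */
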


1205 
1206         self = (tp == curthread);
1207 
1208         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209                 bound = 1;
1210         else
1211                 bound = 0;
1212 
1213         tpri = DISP_PRIO(tp);
1214         if (ncpus == 1)
1215                 cp = tp->t_cpu;
1216         else if (!bound) {
1217                 if (tpri >= kpqpri) {
1218                         setkpdq(tp, SETKP_BACK);
1219                         return;
1220                 }
1221 
1222                 /*
1223                  * We'll generally let this thread continue to run where
1224                  * it last ran...but will consider migration if:
1225                  * - The thread probably doesn't have much cache warmth.
1226                  * - HT exclusion would prefer us to run elsewhere
1227                  * - The CPU where it last ran is the target of an offline
1228                  *   request.
1229                  * - The thread last ran outside its home lgroup.
1230                  */
1231                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1232                     !ht_should_run(tp, tp->t_cpu) ||
1233                     (tp->t_cpu == cpu_inmotion) ||
1234                     !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1235                         cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1236                 } else {
1237                         cp = tp->t_cpu;
1238                 }
1239 
1240                 if (tp->t_cpupart == cp->cpu_part) {
1241                         int     qlen;
1242 
1243                         /*
1244                          * Perform any CMT load balancing
1245                          */
1246                         cp = cmt_balance(tp, cp);
1247 
1248                         /*
1249                          * Balance across the run queues
1250                          */
1251                         qlen = RUNQ_LEN(cp, tpri);
1252                         if (tpri >= RUNQ_MATCH_PRI &&
1253                             !(tp->t_schedflag & TS_RUNQMATCH))
1254                                 qlen -= RUNQ_MAX_DIFF;
1255                         if (qlen > 0) {
1256                                 cpu_t *newcp;
1257 
1258                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259                                         newcp = cp->cpu_next_part;
1260                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1261                                         newcp = cp->cpu_next_part;
1262                                 }
1263 
1264                                 if (ht_should_run(tp, newcp) &&
1265                                     RUNQ_LEN(newcp, tpri) < qlen) {
1266                                         DTRACE_PROBE3(runq__balance,
1267                                             kthread_t *, tp,
1268                                             cpu_t *, cp, cpu_t *, newcp);
1269                                         cp = newcp;
1270                                 }
1271                         }
1272                 } else {
1273                         /*
1274                          * Migrate to a cpu in the new partition.
1275                          */
1276                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1277                             tp->t_pri);
1278                 }
1279                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1280         } else {
1281                 /*
1282                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1283                  * a short time until weak binding that existed when the
1284                  * strong binding was established has dropped) so we must
1285                  * favour weak binding over strong.
1286                  */
1287                 cp = tp->t_weakbound_cpu ?
1288                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1289         }
1290         /*
1291          * A thread that is ONPROC may be temporarily placed on the run queue
1292          * but then chosen to run again by disp.  If the thread we're placing on
1293          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1294          * replacement process is actually scheduled in swtch().  In this
1295          * situation, curthread is the only thread that could be in the ONPROC
1296          * state.
1297          */


1394         }
1395 
1396         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1397                 bound = 1;
1398         else
1399                 bound = 0;
1400 
1401         tpri = DISP_PRIO(tp);
1402         if (ncpus == 1)
1403                 cp = tp->t_cpu;
1404         else if (!bound) {
1405                 if (tpri >= kpqpri) {
1406                         setkpdq(tp, SETKP_FRONT);
1407                         return;
1408                 }
1409                 cp = tp->t_cpu;
1410                 if (tp->t_cpupart == cp->cpu_part) {
1411                         /*
1412                          * We'll generally let this thread continue to run
1413                          * where it last ran, but will consider migration if:
1414                          * - The thread last ran outside its home lgroup.
1415                          * - The CPU where it last ran is the target of an
1416                          *   offline request (a thread_nomigrate() on the in
1417                          *   motion CPU relies on this when forcing a preempt).
1418                          * - The thread isn't the highest priority thread where
1419                          *   it last ran, and it is considered not likely to
1420                          *   have significant cache warmth.
1421                          */
1422                         if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1423                             cp == cpu_inmotion ||
1424                             (tpri < cp->cpu_disp->disp_maxrunpri &&
1425                             !THREAD_HAS_CACHE_WARMTH(tp))) {
1426                                 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1427                         }
1428                 } else {
1429                         /*
1430                          * Migrate to a cpu in the new partition.
1431                          */
1432                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1433                             tp, tp->t_pri);
1434                 }
1435                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1436         } else {
1437                 /*
1438                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1439                  * a short time until weak binding that existed when the
1440                  * strong binding was established has dropped) so we must
1441                  * favour weak binding over strong.
1442                  */
1443                 cp = tp->t_weakbound_cpu ?
1444                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1445         }
1446 
1447         /*
1448          * A thread that is ONPROC may be temporarily placed on the run queue
1449          * but then chosen to run again by disp.  If the thread we're placing on
1450          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1451          * replacement process is actually scheduled in swtch().  In this
1452          * situation, curthread is the only thread that could be in the ONPROC
1453          * state.


1564                 } else {
1565                         ASSERT(dq->dq_last == NULL);
1566                         ASSERT(dq->dq_first == NULL);
1567                         tp->t_link = NULL;
1568                         dq->dq_first = dq->dq_last = tp;
1569                 }
1570                 BT_SET(dp->disp_qactmap, tpri);
1571                 if (tpri > dp->disp_max_unbound_pri)
1572                         dp->disp_max_unbound_pri = tpri;
1573                 if (tpri > dp->disp_maxrunpri) {
1574                         dp->disp_maxrunpri = tpri;
1575                         membar_enter();
1576                 }
1577         }
1578 
1579         cp = tp->t_cpu;
1580         if (tp->t_cpupart != cp->cpu_part) {
1581                 /* migrate to a cpu in the new partition */
1582                 cp = tp->t_cpupart->cp_cpulist;
1583         }
1584         cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1585         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1586         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1587 
1588 #ifndef NPROBE
1589         /* Kernel probe */
1590         if (tnf_tracing_active)
1591                 tnf_thread_queue(tp, cp, tpri);
1592 #endif /* NPROBE */
1593 
1594         if (cp->cpu_chosen_level < tpri)
1595                 cp->cpu_chosen_level = tpri;
1596         cpu_resched(cp, tpri);
1597         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1598         (*disp_enq_thread)(cp, 0);
1599 }
1600 
1601 /*
1602  * Remove a thread from the dispatcher queue if it is on it.
1603  * It is not an error if it is not found but we return whether
1604  * or not it was found in case the caller wants to check.


2537                  *
2538                  * cp has already been removed from the list of active cpus
2539                  * and tp->t_cpu has been changed so there is no risk of
2540                  * tp ending up back on cp.
2541                  *
2542                  * Called from cpupart_move_cpu:
2543                  *
2544                  * The cpu has moved to a new cpupart.  Any threads that
2545                  * were on its dispatch queues before the move remain
2546                  * in the old partition and can't run in the new partition.
2547                  */
2548                 ASSERT(tp->t_cpu != cp);
2549                 thread_unlock(tp);
2550 
2551                 disp_lock_enter(&dp->disp_lock);
2552         }
2553         disp_lock_exit(&dp->disp_lock);
2554 }
2555 
2556 /*
2557  * Return a score rating this CPU for running this thread: lower is better.
2558  *
2559  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2560  * curcpu (as that's our own priority).
2561  *
2562  * If a cpu is the target of an offline request, then try to avoid it.
2563  *
2564  * Otherwise we'll use double the effective dispatcher priority for the CPU.
2565  *
2566  * We do this so ht_adjust_cpu_score() can increment the score if needed,
2567  * without ending up over-riding a dispatcher priority.
2568  */
2569 static pri_t
2570 cpu_score(cpu_t *cp, kthread_t *tp)
2571 {
2572         pri_t score;
2573 
2574         if (tp == curthread && cp == curthread->t_cpu)
2575                 score = 2 * CPU_IDLE_PRI;
2576         else if (cp == cpu_inmotion)
2577                 score = SHRT_MAX;
2578         else
2579                 score = 2 * cp->cpu_dispatch_pri;
2580 
2581         if (2 * cp->cpu_disp->disp_maxrunpri > score)
2582                 score = 2 * cp->cpu_disp->disp_maxrunpri;
2583         if (2 * cp->cpu_chosen_level > score)
2584                 score = 2 * cp->cpu_chosen_level;
2585 
2586         return (ht_adjust_cpu_score(tp, cp, score));
2587 }
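
For illustration only, here is a small userland sketch (not part of the change) of the scoring arithmetic used by cpu_score(). It assumes CPU_IDLE_PRI is -1 and models ht_adjust_cpu_score() as a hypothetical +1 penalty applied when hyperthread exclusion disfavors a CPU; the real adjustment function is added elsewhere in this change.

        #include <assert.h>

        #define EX_CPU_IDLE_PRI (-1)    /* assumed idle sentinel, mirrors CPU_IDLE_PRI */

        /* Hypothetical HT penalty: +1 when exclusion disfavors this CPU. */
        static int
        ex_ht_penalty(int score, int sibling_excluded)
        {
                return (sibling_excluded ? score + 1 : score);
        }

        /*
         * Doubling the dispatch priority leaves room for the +1 penalty to break
         * ties between equally loaded CPUs without ever outranking a genuine
         * one-level priority difference (2*pri + 1 < 2*(pri + 1)).
         */
        int
        main(void)
        {
                int idle = 2 * EX_CPU_IDLE_PRI;         /* -2: an idle CPU */
                int busy60 = 2 * 60;                    /* CPU running a pri-60 thread */
                int busy59 = ex_ht_penalty(2 * 59, 1);  /* pri-59 thread, HT-penalized */

                assert(idle < busy59 && busy59 < busy60); /* penalty never reorders priorities */
                assert(busy59 / 2 == 59);                 /* halving recovers the priority */
                return (0);
        }

Halving the final score, as the "score / 2 == CPU_IDLE_PRI" and "tpri > (best / 2)" tests in the real code do, recovers a value comparable with a dispatcher priority.
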
2588 
2589 /*
2590  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2591  *
2592  * We are looking for a CPU with an effective dispatch priority lower than the
2593  * thread's, so that the thread will run immediately rather than be enqueued.
2594  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2595  * If we don't find an available CPU there, we will expand our search to include
2596  * wider locality levels. (Note these groups are already divided by CPU
2597  * partition.)
2598  *
2599  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2600  * the best home CPU we found.
2601  *
2602  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2603  * other CPU.  The caller should pass in the most recently used CPU for the
2604  * thread; it's of course possible that this CPU isn't in the home lgroup.
2605  *
2606  * This function must be called at either high SPL, or with preemption disabled,
2607  * so that the "hint" CPU cannot be removed from the online CPU list while we
2608  * are traversing it.
2609  */
2610 cpu_t *
2611 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2612 {
2613         cpu_t   *bestcpu;
2614         cpu_t   *besthomecpu;
2615         cpu_t   *cp, *cpstart;
2616 
2617         klgrpset_t      done;
2618 
2619         lpl_t           *lpl_iter, *lpl_leaf;
2620 
2621         ASSERT(hint != NULL);
2622         ASSERT(tp->t_lpl->lpl_ncpu > 0);
2623 
2624         bestcpu = besthomecpu = NULL;
2625         klgrpset_clear(done);
2626 
2627         lpl_iter = tp->t_lpl;
2628 
2629         do {
2630                 pri_t best = SHRT_MAX;
2631                 klgrpset_t cur_set;
2632 
2633                 klgrpset_clear(cur_set);
2634 
2635                 for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2636                         lpl_leaf = lpl_iter->lpl_rset[i];
2637                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2638                                 continue;
2639 
2640                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2641 
2642                         if (hint->cpu_lpl == lpl_leaf)
2643                                 cp = cpstart = hint;
2644                         else
2645                                 cp = cpstart = lpl_leaf->lpl_cpus;
2646 
2647                         do {
2648                                 pri_t score = cpu_score(cp, tp);
2649 
2650                                 if (score < best) {
2651                                         best = score;
2652                                         bestcpu = cp;
2653 
2654                                         /* An idle CPU: we're done. */
2655                                         if (score / 2 == CPU_IDLE_PRI)
2656                                                 goto out;
2657                                 }
2658                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2659                 }
2660 
2661                 if (bestcpu != NULL && tpri > (best / 2))
2662                         goto out;
2663 
2664                 if (besthomecpu == NULL)
2665                         besthomecpu = bestcpu;
2666 
2667                 /*
2668                  * Add the lgrps we just considered to the "done" set
2669                  */
2670                 klgrpset_or(done, cur_set);
2671 
2672         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2673 
2674         /*
2675          * The specified priority isn't high enough to run immediately
2676          * anywhere, so just return the best CPU from the home lgroup.
2677          */
2678         bestcpu = besthomecpu;
2679 
2680 out:
2681         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682         return (bestcpu);
2683 }
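
To make the control flow of the loop above easier to follow in isolation, here is a simplified, self-contained model of the outward lgroup walk. It is illustrative only: klgrpset_t is modeled as a plain bitmask (lgroup ids assumed < 64), the ex_lpl structure only mirrors the lpl_t fields actually used, and the "usable" callback stands in for the cpu_score()/preemption test.

        #include <stddef.h>

        typedef unsigned long ex_lgrpset_t;

        struct ex_lpl {
                int             lpl_lgrpid;
                int             lpl_nrset;
                struct ex_lpl   **lpl_rset;
                struct ex_lpl   *lpl_parent;
        };

        static struct ex_lpl *
        ex_expand_search(struct ex_lpl *start, int (*usable)(struct ex_lpl *))
        {
                ex_lgrpset_t done = 0;
                struct ex_lpl *it;

                for (it = start; it != NULL; it = it->lpl_parent) {
                        int i;

                        for (i = 0; i < it->lpl_nrset; i++) {
                                struct ex_lpl *leaf = it->lpl_rset[i];

                                if (done & (1UL << leaf->lpl_lgrpid))
                                        continue;       /* seen at a closer level */
                                done |= 1UL << leaf->lpl_lgrpid;
                                if (usable(leaf))
                                        return (leaf);  /* can run here right away */
                        }
                }
                return (NULL);  /* nowhere immediate; fall back to best "home" CPU */
        }

The "done" set is what lets each successive ancestor, whose resource set overlaps those already visited, contribute only the leaves that have not yet been examined.
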
2684 
2685 /*
2686  * This routine provides the generic idle cpu function for all processors.
2687  * If a processor has some specific code to execute when idle (say, to stop
2688  * the pipeline and save power) then that routine should be defined in the
2689  * processors specific code (module_xx.c) and the global variable idle_cpu
2690  * set to that function.
2691  */
2692 static void
2693 generic_idle_cpu(void)
2694 {
2695 }
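
The comment above describes a hook rather than an algorithm: a platform replaces the no-op by pointing the global idle_cpu function pointer at its own routine. A minimal hypothetical sketch of such an override (the myplat_* names are invented for illustration; the idle_cpu declaration is assumed to come from the CPU support headers):

        /* The dispatcher's idle hook; declared in the CPU support code. */
        extern void (*idle_cpu)(void);

        /* Hypothetical platform-specific idle routine. */
        static void
        myplat_idle_cpu(void)
        {
                /* e.g. halt the pipeline or enter a low-power state until woken */
        }

        void
        myplat_install_idle(void)
        {
                idle_cpu = myplat_idle_cpu;
        }
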
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }
2702 
2703 cpu_t *
2704 disp_choose_best_cpu(void)
2705 {
2706         kthread_t *t = curthread;
2707         cpu_t *curcpu = CPU;
2708 
2709         ASSERT(t->t_preempt > 0);
2710         ASSERT(t->t_state == TS_ONPROC);
2711         ASSERT(t->t_schedflag & TS_VCPU);
2712 
2713         if (ht_should_run(t, curcpu))
2714                 return (curcpu);
2715 
2716         return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2717 }
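
disp_choose_best_cpu() is new in this change and its callers are not part of this excerpt. As a rough, hypothetical sketch of how the asserted preconditions would be satisfied (preemption disabled via kpreempt_disable(), the caller being an ONPROC thread that already has TS_VCPU set), a call site might look like:

        /*
         * Hypothetical caller sketch only.  kpreempt_disable() makes
         * curthread->t_preempt > 0 so the ASSERTs above hold, and the chosen
         * CPU cannot be offlined while preemption remains disabled.
         */
        static processorid_t
        example_preferred_cpu_for_vcpu(void)
        {
                cpu_t *cp;
                processorid_t id;

                kpreempt_disable();
                cp = disp_choose_best_cpu();
                id = cp->cpu_id;
                kpreempt_enable();

                return (id);
        }
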