1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2018, Joyent, Inc. All rights reserved.
  28  */
  29 
  30 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  31 /*        All Rights Reserved   */
  32 
  33 
  34 #include <sys/types.h>
  35 #include <sys/param.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/signal.h>
  38 #include <sys/user.h>
  39 #include <sys/systm.h>
  40 #include <sys/sysinfo.h>
  41 #include <sys/var.h>
  42 #include <sys/errno.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/debug.h>
  45 #include <sys/inline.h>
  46 #include <sys/disp.h>
  47 #include <sys/class.h>
  48 #include <sys/bitmap.h>
  49 #include <sys/kmem.h>
  50 #include <sys/cpuvar.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/tnf.h>
  53 #include <sys/cpupart.h>
  54 #include <sys/lgrp.h>
  55 #include <sys/pg.h>
  56 #include <sys/cmt.h>
  57 #include <sys/bitset.h>
  58 #include <sys/schedctl.h>
  59 #include <sys/atomic.h>
  60 #include <sys/dtrace.h>
  61 #include <sys/sdt.h>
  62 #include <sys/archsystm.h>
  63 #include <sys/ht.h>
  64 
  65 #include <vm/as.h>
  66 
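/* Types of thread binding recognized by the dispatcher */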
  67 #define BOUND_CPU       0x1
  68 #define BOUND_PARTITION 0x2
  69 #define BOUND_INTR      0x4
  70 
  71 /* Dispatch queue allocation structure and functions */
  72 struct disp_queue_info {
  73         disp_t  *dp;
  74         dispq_t *olddispq;
  75         dispq_t *newdispq;
  76         ulong_t *olddqactmap;
  77         ulong_t *newdqactmap;
  78         int     oldnglobpris;
  79 };
  80 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  81     disp_t *dp);
  82 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  83 static void     disp_dq_free(struct disp_queue_info *dptr);
  84 
  85 /* platform-specific routine to call when processor is idle */
  86 static void     generic_idle_cpu();
  87 void            (*idle_cpu)() = generic_idle_cpu;
  88 
  89 /* routines invoked when a CPU enters/exits the idle loop */
  90 static void     idle_enter();
  91 static void     idle_exit();
  92 
  93 /* platform-specific routine to call when thread is enqueued */
  94 static void     generic_enq_thread(cpu_t *, int);
  95 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  96 
  97 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  98 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  99 pri_t   intr_pri;               /* interrupt thread priority base level */
 100 
 101 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
 102 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
 103 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
 104 disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 105 int     nswapped;               /* total number of swapped threads */
 106 void    disp_swapped_enq(kthread_t *tp);
 107 static void     disp_swapped_setrun(kthread_t *tp);
 108 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 109 
 110 /*
 111  * If this is set, only interrupt threads will cause kernel preemptions.
 112  * This is done by changing the value of kpreemptpri.  kpreemptpri
 113  * will either be the max sysclass pri + 1 or the min interrupt pri.
 114  */
 115 int     only_intr_kpreempt;
 116 
 117 extern void set_idle_cpu(int cpun);
 118 extern void unset_idle_cpu(int cpun);
 119 static void setkpdq(kthread_t *tp, int borf);
 120 #define SETKP_BACK      0
 121 #define SETKP_FRONT     1
 122 /*
 123  * Parameter that determines how recently a thread must have run
 124  * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is expressed in clock ticks.
 126  */
 127 #define RECHOOSE_INTERVAL 3
 128 int     rechoose_interval = RECHOOSE_INTERVAL;
 129 
/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * have been sitting on a run queue before it can be stolen by another
 * CPU.  This reduces thread migrations.
 *
 * nosteal_nsec should be set to an appropriate value by the platform
 * code in cmp_set_nosteal_interval().  It is initialized here to
 * NOSTEAL_UNINITIALIZED to indicate that it has not been set yet.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
 141 #define NOSTEAL_UNINITIALIZED   (-1)
 142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 143 extern void cmp_set_nosteal_interval(void);
 144 
 145 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 146 
 147 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 148 disp_lock_t     stop_lock;              /* lock on stopped threads */
 149 
 150 static void     cpu_dispqalloc(int numpris);
 151 
 152 /*
 153  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it had been sitting on its run queue for only a
 * very short period of time.
 156  */
 157 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 158 
 159 static kthread_t        *disp_getwork(cpu_t *to);
 160 static kthread_t        *disp_getbest(disp_t *from);
 161 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 162 
 163 void    swtch_to(kthread_t *);
 164 
 165 /*
 166  * dispatcher and scheduler initialization
 167  */
 168 
 169 /*
 170  * disp_setup - Common code to calculate and allocate dispatcher
 171  *              variables and structures based on the maximum priority.
 172  */
 173 static void
 174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 175 {
 176         pri_t   newnglobpris;
 177 
 178         ASSERT(MUTEX_HELD(&cpu_lock));
 179 
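        /*
         * Global priorities must cover every scheduling-class priority
         * plus one level per interrupt priority (LOCK_LEVEL), since
         * interrupt threads run above the scheduling classes.
         */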
 180         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 181 
 182         if (newnglobpris > oldnglobpris) {
 183                 /*
 184                  * Allocate new kp queues for each CPU partition.
 185                  */
 186                 cpupart_kpqalloc(newnglobpris);
 187 
 188                 /*
 189                  * Allocate new dispatch queues for each CPU.
 190                  */
 191                 cpu_dispqalloc(newnglobpris);
 192 
 193                 /*
 194                  * compute new interrupt thread base priority
 195                  */
 196                 intr_pri = maxglobpri;
 197                 if (only_intr_kpreempt) {
 198                         kpreemptpri = intr_pri + 1;
 199                         if (kpqpri == KPQPRI)
 200                                 kpqpri = kpreemptpri;
 201                 }
 202                 v.v_nglobpris = newnglobpris;
 203         }
 204 }
 205 
 206 /*
 207  * dispinit - Called to initialize all loaded classes and the
 208  *            dispatcher framework.
 209  */
 210 void
 211 dispinit(void)
 212 {
 213         id_t    cid;
 214         pri_t   maxglobpri;
 215         pri_t   cl_maxglobpri;
 216 
 217         maxglobpri = -1;
 218 
 219         /*
         * Initialize the transition lock, which is held permanently;
         * threads in transition have their thread lock pointed at it.
 221          */
 222         DISP_LOCK_INIT(&transition_lock);
 223         disp_lock_enter_high(&transition_lock);
 224         DISP_LOCK_INIT(&stop_lock);
 225 
 226         mutex_enter(&cpu_lock);
 227         CPU->cpu_disp->disp_maxrunpri = -1;
 228         CPU->cpu_disp->disp_max_unbound_pri = -1;
 229 
 230         /*
 231          * Initialize the default CPU partition.
 232          */
 233         cpupart_initialize_default();
 234         /*
 235          * Call the class specific initialization functions for
 236          * all pre-installed schedulers.
 237          *
 238          * We pass the size of a class specific parameter
 239          * buffer to each of the initialization functions
 240          * to try to catch problems with backward compatibility
 241          * of class modules.
 242          *
         * For example, a new class module running on an old system
 244          * which didn't provide sufficiently large parameter buffers
 245          * would be bad news. Class initialization modules can check for
 246          * this and take action if they detect a problem.
 247          */
 248 
 249         for (cid = 0; cid < nclass; cid++) {
 250                 sclass_t        *sc;
 251 
 252                 sc = &sclass[cid];
 253                 if (SCHED_INSTALLED(sc)) {
 254                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 255                             &sc->cl_funcs);
 256                         if (cl_maxglobpri > maxglobpri)
 257                                 maxglobpri = cl_maxglobpri;
 258                 }
 259         }
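        /*
         * Kernel preemption applies above the highest system-class
         * priority; the kernel preemption queue threshold defaults to
         * the same level unless overridden via /etc/system.
         */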
 260         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 261         if (kpqpri == KPQPRI)
 262                 kpqpri = kpreemptpri;
 263 
 264         ASSERT(maxglobpri >= 0);
 265         disp_setup(maxglobpri, 0);
 266 
 267         mutex_exit(&cpu_lock);
 268 
 269         /*
 270          * Platform specific sticky scheduler setup.
 271          */
 272         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 273                 cmp_set_nosteal_interval();
 274 
 275         /*
 276          * Get the default class ID; this may be later modified via
 277          * dispadmin(1M).  This will load the class (normally TS) and that will
 278          * call disp_add(), which is why we had to drop cpu_lock first.
 279          */
 280         if (getcid(defaultclass, &defaultcid) != 0) {
 281                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 282                     defaultclass);
 283         }
 284 }
 285 
 286 /*
 287  * disp_add - Called with class pointer to initialize the dispatcher
 288  *            for a newly loaded class.
 289  */
 290 void
 291 disp_add(sclass_t *clp)
 292 {
 293         pri_t   maxglobpri;
 294         pri_t   cl_maxglobpri;
 295 
 296         mutex_enter(&cpu_lock);
 297         /*
 298          * Initialize the scheduler class.
 299          */
 300         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 301         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 302         if (cl_maxglobpri > maxglobpri)
 303                 maxglobpri = cl_maxglobpri;
 304 
 305         /*
         * Save the old queue information.  Since we're initializing a
         * scheduling class that has just been loaded, the size of the
         * dispatch queues may have changed; disp_setup() handles that.
 310          */
 311         disp_setup(maxglobpri, v.v_nglobpris);
 312 
 313         mutex_exit(&cpu_lock);
 314 }
 315 
 316 
 317 /*
 318  * For each CPU, allocate new dispatch queues
 319  * with the stated number of priorities.
 320  */
 321 static void
 322 cpu_dispqalloc(int numpris)
 323 {
 324         cpu_t   *cpup;
 325         struct disp_queue_info  *disp_mem;
 326         int i, num;
 327 
 328         ASSERT(MUTEX_HELD(&cpu_lock));
 329 
 330         disp_mem = kmem_zalloc(NCPU *
 331             sizeof (struct disp_queue_info), KM_SLEEP);
 332 
 333         /*
         * This routine must allocate all of the memory before stopping
         * the CPUs because it must not sleep in kmem_alloc while the
         * CPUs are stopped.  Locks they hold will not be released until
         * they are restarted.
 338          */
 339         i = 0;
 340         cpup = cpu_list;
 341         do {
 342                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 343                 i++;
 344                 cpup = cpup->cpu_next;
 345         } while (cpup != cpu_list);
 346         num = i;
 347 
 348         pause_cpus(NULL, NULL);
 349         for (i = 0; i < num; i++)
 350                 disp_dq_assign(&disp_mem[i], numpris);
 351         start_cpus();
 352 
 353         /*
         * All of the memory must be freed after the CPUs are restarted,
         * because we cannot risk sleeping in kmem_free while the CPUs
         * are stopped.
 356          */
 357         for (i = 0; i < num; i++)
 358                 disp_dq_free(&disp_mem[i]);
 359 
 360         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 361 }
 362 
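/*
 * Pre-allocate a dispatch queue array and active-queue bitmap large enough
 * for numpris priorities.  The swap into place happens in disp_dq_assign().
 */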
 363 static void
 364 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 365 {
 366         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 367         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 368             sizeof (long), KM_SLEEP);
 369         dptr->dp = dp;
 370 }
 371 
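/*
 * Install the newly allocated queue array and bitmap on the given disp_t,
 * copying over the existing contents and saving the old pointers so they
 * can be freed later.  This may run while CPUs are paused, so it must
 * not block.
 */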
 372 static void
 373 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 374 {
 375         disp_t  *dp;
 376 
 377         dp = dptr->dp;
 378         dptr->olddispq = dp->disp_q;
 379         dptr->olddqactmap = dp->disp_qactmap;
 380         dptr->oldnglobpris = dp->disp_npri;
 381 
 382         ASSERT(dptr->oldnglobpris < numpris);
 383 
 384         if (dptr->olddispq != NULL) {
 385                 /*
                 * Use kcopy because bcopy is platform-specific and
                 * could block while the CPUs are paused.
 388                  */
 389                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 390                     dptr->oldnglobpris * sizeof (dispq_t));
 391                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 392                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 393                     sizeof (long));
 394         }
 395         dp->disp_q = dptr->newdispq;
 396         dp->disp_qactmap = dptr->newdqactmap;
 397         dp->disp_q_limit = &dptr->newdispq[numpris];
 398         dp->disp_npri = numpris;
 399 }
 400 
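/*
 * Free the previous dispatch queue array and active-queue bitmap recorded
 * in the "old" fields of dptr.
 */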
 401 static void
 402 disp_dq_free(struct disp_queue_info *dptr)
 403 {
 404         if (dptr->olddispq != NULL)
 405                 kmem_free(dptr->olddispq,
 406                     dptr->oldnglobpris * sizeof (dispq_t));
 407         if (dptr->olddqactmap != NULL)
 408                 kmem_free(dptr->olddqactmap,
 409                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 410 }
 411 
 412 /*
 413  * For a newly created CPU, initialize the dispatch queue.
 414  * This is called before the CPU is known through cpu[] or on any lists.
 415  */
 416 void
 417 disp_cpu_init(cpu_t *cp)
 418 {
 419         disp_t  *dp;
 420         dispq_t *newdispq;
 421         ulong_t *newdqactmap;
 422 
 423         ASSERT(MUTEX_HELD(&cpu_lock));      /* protect dispatcher queue sizes */
 424 
 425         if (cp == cpu0_disp.disp_cpu)
 426                 dp = &cpu0_disp;
 427         else
 428                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 429         bzero(dp, sizeof (disp_t));
 430         cp->cpu_disp = dp;
 431         dp->disp_cpu = cp;
 432         dp->disp_maxrunpri = -1;
 433         dp->disp_max_unbound_pri = -1;
 434         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 435         /*
 436          * Allocate memory for the dispatcher queue headers
 437          * and the active queue bitmap.
 438          */
 439         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 440         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 441             sizeof (long), KM_SLEEP);
 442         dp->disp_q = newdispq;
 443         dp->disp_qactmap = newdqactmap;
 444         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 445         dp->disp_npri = v.v_nglobpris;
 446 }
 447 
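/*
 * Tear down the dispatch queue of a CPU that is being removed.
 */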
 448 void
 449 disp_cpu_fini(cpu_t *cp)
 450 {
 451         ASSERT(MUTEX_HELD(&cpu_lock));
 452 
 453         disp_kp_free(cp->cpu_disp);
 454         if (cp->cpu_disp != &cpu0_disp)
 455                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 456 }
 457 
 458 /*
 459  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 460  */
 461 void
 462 disp_kp_alloc(disp_t *dq, pri_t npri)
 463 {
 464         struct disp_queue_info  mem_info;
 465 
 466         if (npri > dq->disp_npri) {
 467                 /*
 468                  * Allocate memory for the new array.
 469                  */
 470                 disp_dq_alloc(&mem_info, npri, dq);
 471 
 472                 /*
 473                  * We need to copy the old structures to the new
 474                  * and free the old.
 475                  */
 476                 disp_dq_assign(&mem_info, npri);
 477                 disp_dq_free(&mem_info);
 478         }
 479 }
 480 
 481 /*
 482  * Free dispatch queue.
 483  * Used for the kpreempt queues for a removed CPU partition and
 484  * for the per-CPU queues of deleted CPUs.
 485  */
 486 void
 487 disp_kp_free(disp_t *dq)
 488 {
 489         struct disp_queue_info  mem_info;
 490 
 491         mem_info.olddispq = dq->disp_q;
 492         mem_info.olddqactmap = dq->disp_qactmap;
 493         mem_info.oldnglobpris = dq->disp_npri;
 494         disp_dq_free(&mem_info);
 495 }
 496 
 497 /*
 498  * End dispatcher and scheduler initialization.
 499  */
 500 
 501 /*
 502  * See if there's anything to do other than remain idle.
 503  * Return non-zero if there is.
 504  *
 505  * This function must be called with high spl, or with
 506  * kernel preemption disabled to prevent the partition's
 507  * active cpu list from changing while being traversed.
 508  *
 509  * This is essentially a simpler version of disp_getwork()
 510  * to be called by CPUs preparing to "halt".
 511  */
 512 int
 513 disp_anywork(void)
 514 {
 515         cpu_t           *cp = CPU;
 516         cpu_t           *ocp;
 517         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 518 
 519         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 520                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 521                         return (1);
 522 
 523                 for (ocp = cp->cpu_next_part; ocp != cp;
 524                     ocp = ocp->cpu_next_part) {
 525                         ASSERT(CPU_ACTIVE(ocp));
 526 
 527                         /*
 528                          * Something has appeared on the local run queue.
 529                          */
 530                         if (*local_nrunnable > 0)
 531                                 return (1);
 532                         /*
                         * If we encounter another idle CPU that will
                         * soon be trolling around through disp_anywork(),
                         * terminate our walk here and let that CPU patrol
                         * the next part of the list.
 537                          */
 538                         if (ocp->cpu_dispatch_pri == -1 &&
 539                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 540                                 return (0);
 541                         /*
 542                          * Work can be taken from another CPU if:
                         *      - There is unbound work on the run queue
                         *      - That work isn't a thread undergoing a
                         *        context switch on an otherwise empty queue.
 546                          *      - The CPU isn't running the idle loop.
 547                          */
 548                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 549                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 550                             ocp->cpu_disp->disp_nrunnable == 1) &&
 551                             ocp->cpu_dispatch_pri != -1)
 552                                 return (1);
 553                 }
 554         }
 555         return (0);
 556 }
 557 
 558 /*
 559  * Called when CPU enters the idle loop
 560  */
 561 static void
 562 idle_enter()
 563 {
 564         cpu_t           *cp = CPU;
 565 
 566         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 567         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 568         set_idle_cpu(cp->cpu_id);    /* arch-dependent hook */
 569 }
 570 
 571 /*
 572  * Called when CPU exits the idle loop
 573  */
 574 static void
 575 idle_exit()
 576 {
 577         cpu_t           *cp = CPU;
 578 
 579         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 580         unset_idle_cpu(cp->cpu_id);  /* arch-dependent hook */
 581 }
 582 
 583 /*
 584  * Idle loop.
 585  */
 586 void
 587 idle()
 588 {
 589         struct cpu      *cp = CPU;              /* pointer to this CPU */
 590         kthread_t       *t;                     /* taken thread */
 591 
 592         idle_enter();
 593 
 594         /*
 595          * Uniprocessor version of idle loop.
 596          * Do this until notified that we're on an actual multiprocessor.
 597          */
 598         while (ncpus == 1) {
 599                 if (cp->cpu_disp->disp_nrunnable == 0) {
 600                         (*idle_cpu)();
 601                         continue;
 602                 }
 603                 idle_exit();
 604                 swtch();
 605 
 606                 idle_enter(); /* returned from swtch */
 607         }
 608 
 609         /*
 610          * Multiprocessor idle loop.
 611          */
 612         for (;;) {
 613                 /*
 614                  * If CPU is completely quiesced by p_online(2), just wait
 615                  * here with minimal bus traffic until put online.
 616                  */
 617                 while (cp->cpu_flags & CPU_QUIESCED)
 618                         (*idle_cpu)();
 619 
 620                 if (cp->cpu_disp->disp_nrunnable != 0) {
 621                         idle_exit();
 622                         swtch();
 623                 } else {
 624                         if (cp->cpu_flags & CPU_OFFLINE)
 625                                 continue;
 626                         if ((t = disp_getwork(cp)) == NULL) {
 627                                 if (cp->cpu_chosen_level != -1) {
 628                                         disp_t *dp = cp->cpu_disp;
 629                                         disp_t *kpq;
 630 
 631                                         disp_lock_enter(&dp->disp_lock);
 632                                         /*
 633                                          * Set kpq under lock to prevent
 634                                          * migration between partitions.
 635                                          */
 636                                         kpq = &cp->cpu_part->cp_kp_queue;
 637                                         if (kpq->disp_maxrunpri == -1)
 638                                                 cp->cpu_chosen_level = -1;
 639                                         disp_lock_exit(&dp->disp_lock);
 640                                 }
 641                                 (*idle_cpu)();
 642                                 continue;
 643                         }
 644                         /*
 645                          * If there was a thread but we couldn't steal
 646                          * it, then keep trying.
 647                          */
 648                         if (t == T_DONTSTEAL)
 649                                 continue;
 650                         idle_exit();
 651                         swtch_to(t);
 652                 }
 653                 idle_enter(); /* returned from swtch/swtch_to */
 654         }
 655 }
 656 
 657 
 658 /*
 659  * Preempt the currently running thread in favor of the highest
 660  * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If the system is panicking,
 * simply return without preempting.
 663  */
 664 void
 665 preempt()
 666 {
 667         kthread_t       *t = curthread;
 668         klwp_t          *lwp = ttolwp(curthread);
 669 
 670         if (panicstr)
 671                 return;
 672 
 673         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 674 
 675         thread_lock(t);
 676 
 677         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 678                 /*
                 * This thread has already been chosen to be run on
 680                  * another CPU. Clear kprunrun on this CPU since we're
 681                  * already headed for swtch().
 682                  */
 683                 CPU->cpu_kprunrun = 0;
 684                 thread_unlock_nopreempt(t);
 685                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 686         } else {
 687                 if (lwp != NULL)
 688                         lwp->lwp_ru.nivcsw++;
 689                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 690                 THREAD_TRANSITION(t);
 691                 CL_PREEMPT(t);
 692                 DTRACE_SCHED(preempt);
 693                 thread_unlock_nopreempt(t);
 694 
 695                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 696 
 697                 swtch();                /* clears CPU->cpu_runrun via disp() */
 698         }
 699 }
 700 
 701 extern kthread_t *thread_unpin();
 702 
 703 /*
 704  * disp() - find the highest priority thread for this processor to run, and
 705  * set it in TS_ONPROC state so that resume() can be called to run it.
 706  */
 707 static kthread_t *
 708 disp()
 709 {
 710         cpu_t           *cpup;
 711         disp_t          *dp;
 712         kthread_t       *tp;
 713         dispq_t         *dq;
 714         int             maxrunword;
 715         pri_t           pri;
 716         disp_t          *kpq;
 717 
 718         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 719 
 720         cpup = CPU;
 721         /*
 722          * Find the highest priority loaded, runnable thread.
 723          */
 724         dp = cpup->cpu_disp;
 725 
 726 reschedule:
 727         /*
 728          * If there is more important work on the global queue with a better
 729          * priority than the maximum on this CPU, take it now.
 730          */
 731         kpq = &cpup->cpu_part->cp_kp_queue;
 732         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 733             pri >= dp->disp_maxrunpri &&
 734             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 735             (tp = disp_getbest(kpq)) != NULL) {
 736                 if (disp_ratify(tp, kpq) != NULL) {
 737                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 738                             "disp_end:tid %p", tp);
 739                         return (tp);
 740                 }
 741         }
 742 
 743         disp_lock_enter(&dp->disp_lock);
 744         pri = dp->disp_maxrunpri;
 745 
 746         /*
 747          * If there is nothing to run, look at what's runnable on other queues.
 748          * Choose the idle thread if the CPU is quiesced.
 749          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 750          * interrupt threads, which will be the only threads on the CPU's own
 751          * queue, but cannot run threads from other queues.
 752          */
 753         if (pri == -1) {
 754                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 755                         disp_lock_exit(&dp->disp_lock);
 756                         if ((tp = disp_getwork(cpup)) == NULL ||
 757                             tp == T_DONTSTEAL) {
 758                                 tp = cpup->cpu_idle_thread;
 759                                 (void) splhigh();
 760                                 THREAD_ONPROC(tp, cpup);
 761                                 cpup->cpu_dispthread = tp;
 762                                 cpup->cpu_dispatch_pri = -1;
 763                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 764                                 cpup->cpu_chosen_level = -1;
 765                         }
 766                 } else {
 767                         disp_lock_exit_high(&dp->disp_lock);
 768                         tp = cpup->cpu_idle_thread;
 769                         THREAD_ONPROC(tp, cpup);
 770                         cpup->cpu_dispthread = tp;
 771                         cpup->cpu_dispatch_pri = -1;
 772                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 773                         cpup->cpu_chosen_level = -1;
 774                 }
 775                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 776                     "disp_end:tid %p", tp);
 777                 return (tp);
 778         }
 779 
 780         dq = &dp->disp_q[pri];
 781         tp = dq->dq_first;
 782 
 783         ASSERT(tp != NULL);
 784         ASSERT(tp->t_schedflag & TS_LOAD);       /* thread must be swapped in */
 785 
 786         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 787 
 788         /*
         * Found it, so remove it from the queue.
 790          */
 791         dp->disp_nrunnable--;
 792         dq->dq_sruncnt--;
 793         if ((dq->dq_first = tp->t_link) == NULL) {
 794                 ulong_t *dqactmap = dp->disp_qactmap;
 795 
 796                 ASSERT(dq->dq_sruncnt == 0);
 797                 dq->dq_last = NULL;
 798 
 799                 /*
 800                  * The queue is empty, so the corresponding bit needs to be
                 * turned off in dqactmap.  If threads remain runnable
                 * (nrunnable != 0), we just took the last thread off the
                 * highest queue, so recompute disp_maxrunpri.
 804                  */
 805                 maxrunword = pri >> BT_ULSHIFT;
 806                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 807 
 808                 if (dp->disp_nrunnable == 0) {
 809                         dp->disp_max_unbound_pri = -1;
 810                         dp->disp_maxrunpri = -1;
 811                 } else {
 812                         int ipri;
 813 
 814                         ipri = bt_gethighbit(dqactmap, maxrunword);
 815                         dp->disp_maxrunpri = ipri;
 816                         if (ipri < dp->disp_max_unbound_pri)
 817                                 dp->disp_max_unbound_pri = ipri;
 818                 }
 819         } else {
 820                 tp->t_link = NULL;
 821         }
 822 
 823         /*
 824          * Set TS_DONT_SWAP flag to prevent another processor from swapping
 825          * out this thread before we have a chance to run it.
 826          * While running, it is protected against swapping by t_lock.
 827          */
 828         tp->t_schedflag |= TS_DONT_SWAP;
 829         cpup->cpu_dispthread = tp;           /* protected by spl only */
 830         cpup->cpu_dispatch_pri = pri;
 831         ASSERT(pri == DISP_PRIO(tp));
 832         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 833         disp_lock_exit_high(&dp->disp_lock);     /* drop run queue lock */
 834 
 835         ASSERT(tp != NULL);
 836         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 837             "disp_end:tid %p", tp);
 838 
 839         if (disp_ratify(tp, kpq) == NULL)
 840                 goto reschedule;
 841 
 842         return (tp);
 843 }
 844 
 845 /*
 846  * swtch()
 847  *      Find best runnable thread and run it.
 *      Called with the current thread already switched to a new state:
 *      on a sleep queue, on a run queue, or stopped, but not zombied.
 850  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 851  *      Always drops spl to the base level (spl0()).
 852  */
 853 void
 854 swtch()
 855 {
 856         kthread_t       *t = curthread;
 857         kthread_t       *next;
 858         cpu_t           *cp;
 859 
 860         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 861 
 862         if (t->t_flag & T_INTR_THREAD)
 863                 cpu_intr_swtch_enter(t);
 864 
 865         if (t->t_intr != NULL) {
 866                 /*
                 * We are an interrupt thread.  Set up and return
 868                  * the interrupted thread to be resumed.
 869                  */
 870                 (void) splhigh();       /* block other scheduler action */
 871                 cp = CPU;               /* now protected against migration */
 872                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 873                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 874                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 875                 next = thread_unpin();
 876                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 877                 resume_from_intr(next);
 878         } else {
 879 #ifdef  DEBUG
 880                 if (t->t_state == TS_ONPROC &&
 881                     t->t_disp_queue->disp_cpu == CPU &&
 882                     t->t_preempt == 0) {
 883                         thread_lock(t);
 884                         ASSERT(t->t_state != TS_ONPROC ||
 885                             t->t_disp_queue->disp_cpu != CPU ||
 886                             t->t_preempt != 0);      /* cannot migrate */
 887                         thread_unlock_nopreempt(t);
 888                 }
 889 #endif  /* DEBUG */
 890                 cp = CPU;
 891                 next = disp();          /* returns with spl high */
 892                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 893 
 894                 /* OK to steal anything left on run queue */
 895                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 896 
 897                 if (next != t) {
 898                         hrtime_t now;
 899 
 900                         now = gethrtime_unscaled();
 901                         pg_ev_thread_swtch(cp, now, t, next);
 902 
 903                         /*
 904                          * If t was previously in the TS_ONPROC state,
 905                          * setfrontdq and setbackdq won't have set its t_waitrq.
 906                          * Since we now finally know that we're switching away
 907                          * from this thread, set its t_waitrq if it is on a run
 908                          * queue.
 909                          */
 910                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 911                                 t->t_waitrq = now;
 912                         }
 913 
 914                         /*
 915                          * restore mstate of thread that we are switching to
 916                          */
 917                         restore_mstate(next);
 918 
 919                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 920                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 921                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 922 
 923                         if (dtrace_vtime_active)
 924                                 dtrace_vtime_switch(next);
 925 
 926                         resume(next);
 927                         /*
 928                          * The TR_RESUME_END and TR_SWTCH_END trace points
 929                          * appear at the end of resume(), because we may not
 930                          * return here
 931                          */
 932                 } else {
 933                         if (t->t_flag & T_INTR_THREAD)
 934                                 cpu_intr_swtch_exit(t);
 935                         /*
 936                          * Threads that enqueue themselves on a run queue defer
 937                          * setting t_waitrq. It is then either set in swtch()
 938                          * when the CPU is actually yielded, or not at all if it
 939                          * is remaining on the CPU.
                         * There is, however, a window between when the thread
                         * placed itself on a run queue and when it selects
                         * itself in disp(), during which a third party (e.g.
                         * clock() doing tick processing) may have re-enqueued
                         * this thread, setting t_waitrq in the process.  We
                         * detect this race by noticing that, despite switching
                         * to ourselves, our t_waitrq has been set and should
                         * be cleared.
 948                          */
 949                         if (t->t_waitrq != 0)
 950                                 t->t_waitrq = 0;
 951 
 952                         pg_ev_thread_remain(cp, t);
 953 
 954                         DTRACE_SCHED(remain__cpu);
 955                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 956                         (void) spl0();
 957                 }
 958         }
 959 }
 960 
 961 /*
 962  * swtch_from_zombie()
 963  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 964  *      eliminated from normal resume.
 965  *      Find best runnable thread and run it.
 966  *      Called with the current thread zombied.
 967  *      Zombies cannot migrate, so CPU references are safe.
 968  */
 969 void
 970 swtch_from_zombie()
 971 {
 972         kthread_t       *next;
 973         cpu_t           *cpu = CPU;
 974 
 975         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 976 
 977         ASSERT(curthread->t_state == TS_ZOMB);
 978 
 979         next = disp();                  /* returns with spl high */
 980         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 981         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 982         ASSERT(next != curthread);
 983         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 984 
 985         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 986 
 987         restore_mstate(next);
 988 
 989         if (dtrace_vtime_active)
 990                 dtrace_vtime_switch(next);
 991 
 992         resume_from_zombie(next);
 993         /*
 994          * The TR_RESUME_END and TR_SWTCH_END trace points
 995          * appear at the end of resume(), because we certainly will not
 996          * return here
 997          */
 998 }
 999 
1000 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1001 
1002 /*
1003  * search_disp_queues()
1004  *      Search the given dispatch queues for thread tp.
1005  *      Return 1 if tp is found, otherwise return 0.
1006  */
1007 static int
1008 search_disp_queues(disp_t *dp, kthread_t *tp)
1009 {
1010         dispq_t         *dq;
1011         dispq_t         *eq;
1012 
1013         disp_lock_enter_high(&dp->disp_lock);
1014 
1015         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1016                 kthread_t       *rp;
1017 
1018                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1019 
1020                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1021                         if (tp == rp) {
1022                                 disp_lock_exit_high(&dp->disp_lock);
1023                                 return (1);
1024                         }
1025         }
1026         disp_lock_exit_high(&dp->disp_lock);
1027 
1028         return (0);
1029 }
1030 
1031 /*
1032  * thread_on_queue()
1033  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1034  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1035  */
1036 static int
1037 thread_on_queue(kthread_t *tp)
1038 {
1039         cpu_t           *cp;
1040         struct cpupart  *part;
1041 
1042         ASSERT(getpil() >= DISP_LEVEL);
1043 
1044         /*
1045          * Search the per-CPU dispatch queues for tp.
1046          */
1047         cp = CPU;
1048         do {
1049                 if (search_disp_queues(cp->cpu_disp, tp))
1050                         return (1);
1051         } while ((cp = cp->cpu_next_onln) != CPU);
1052 
1053         /*
1054          * Search the partition-wide kpreempt queues for tp.
1055          */
1056         part = CPU->cpu_part;
1057         do {
1058                 if (search_disp_queues(&part->cp_kp_queue, tp))
1059                         return (1);
1060         } while ((part = part->cp_next) != CPU->cpu_part);
1061 
1062         return (0);
1063 }
1064 
1065 #else
1066 
1067 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1068 
1069 #endif  /* DEBUG */
1070 
1071 /*
 * Like swtch(), but switch to a specified thread taken from another CPU.
 *      Called with spl high.
1074  */
1075 void
1076 swtch_to(kthread_t *next)
1077 {
1078         cpu_t                   *cp = CPU;
1079         hrtime_t                now;
1080 
1081         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1082 
1083         /*
1084          * Update context switch statistics.
1085          */
1086         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1087 
1088         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1089 
1090         now = gethrtime_unscaled();
1091         pg_ev_thread_swtch(cp, now, curthread, next);
1092 
1093         /* OK to steal anything left on run queue */
1094         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1095 
1096         /* record last execution time */
1097         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1098 
1099         /*
1100          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1101          * won't have set its t_waitrq.  Since we now finally know that we're
1102          * switching away from this thread, set its t_waitrq if it is on a run
1103          * queue.
1104          */
1105         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1106                 curthread->t_waitrq = now;
1107         }
1108 
1109         /* restore next thread to previously running microstate */
1110         restore_mstate(next);
1111 
1112         if (dtrace_vtime_active)
1113                 dtrace_vtime_switch(next);
1114 
1115         resume(next);
1116         /*
1117          * The TR_RESUME_END and TR_SWTCH_END trace points
1118          * appear at the end of resume(), because we may not
1119          * return here
1120          */
1121 }
1122 
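/*
 * Prod a CPU to reschedule when a thread of priority tpri has been made
 * runnable on it: set cpu_runrun and/or cpu_kprunrun as appropriate and,
 * if the target is a remote CPU, poke it so that it notices.
 */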
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126         int     call_poke_cpu = 0;
1127         pri_t   cpupri = cp->cpu_dispatch_pri;
1128 
1129         if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1130                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133                         cp->cpu_runrun = 1;
1134                         aston(cp->cpu_dispthread);
1135                         if (tpri < kpreemptpri && cp != CPU)
1136                                 call_poke_cpu = 1;
1137                 }
1138                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139                         cp->cpu_kprunrun = 1;
1140                         if (cp != CPU)
1141                                 call_poke_cpu = 1;
1142                 }
1143         }
1144 
1145         /*
         * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1147          */
1148         membar_enter();
1149 
1150         if (call_poke_cpu)
1151                 poke_cpu(cp->cpu_id);
1152 }
1153 
1154 /*
1155  * setbackdq() keeps runqs balanced such that the difference in length
1156  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1157  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1158  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1159  * try to keep runqs perfectly balanced regardless of the thread priority.
1160  */
1161 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1162 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1163 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1164 
1165 /*
1166  * Macro that evaluates to true if it is likely that the thread has cache
1167  * warmth. This is based on the amount of time that has elapsed since the
1168  * thread last ran. If that amount of time is less than "rechoose_interval"
1169  * ticks, then we decide that the thread has enough cache warmth to warrant
1170  * some affinity for t->t_cpu.
1171  */
1172 #define THREAD_HAS_CACHE_WARMTH(thread) \
1173         ((thread == curthread) ||       \
1174         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1175 /*
1176  * Put the specified thread on the back of the dispatcher
1177  * queue corresponding to its current priority.
1178  *
1179  * Called with the thread in transition, onproc or stopped state
1180  * and locked (transition implies locked) and at high spl.
1181  * Returns with the thread in TS_RUN state and still locked.
1182  */
1183 void
1184 setbackdq(kthread_t *tp)
1185 {
1186         dispq_t *dq;
1187         disp_t          *dp;
1188         cpu_t           *cp;
1189         pri_t           tpri;
1190         int             bound;
1191         boolean_t       self;
1192 
1193         ASSERT(THREAD_LOCK_HELD(tp));
1194         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1195         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1196 
1197         /*
1198          * If thread is "swapped" or on the swap queue don't
1199          * queue it, but wake sched.
1200          */
1201         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1202                 disp_swapped_setrun(tp);
1203                 return;
1204         }
1205 
1206         self = (tp == curthread);
1207 
1208         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209                 bound = 1;
1210         else
1211                 bound = 0;
1212 
1213         tpri = DISP_PRIO(tp);
1214         if (ncpus == 1)
1215                 cp = tp->t_cpu;
1216         else if (!bound) {
1217                 if (tpri >= kpqpri) {
1218                         setkpdq(tp, SETKP_BACK);
1219                         return;
1220                 }
1221 
1222                 /*
1223                  * We'll generally let this thread continue to run where
1224                  * it last ran...but will consider migration if:
1225                  * - The thread probably doesn't have much cache warmth.
                 * - HT exclusion would prefer us to run elsewhere.
1227                  * - The CPU where it last ran is the target of an offline
1228                  *   request.
1229                  * - The thread last ran outside its home lgroup.
1230                  */
1231                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1232                     !ht_should_run(tp, tp->t_cpu) ||
1233                     (tp->t_cpu == cpu_inmotion) ||
1234                     !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1235                         cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1236                 } else {
1237                         cp = tp->t_cpu;
1238                 }
1239 
1240                 if (tp->t_cpupart == cp->cpu_part) {
1241                         int     qlen;
1242 
1243                         /*
1244                          * Perform any CMT load balancing
1245                          */
1246                         cp = cmt_balance(tp, cp);
1247 
1248                         /*
1249                          * Balance across the run queues
1250                          */
1251                         qlen = RUNQ_LEN(cp, tpri);
1252                         if (tpri >= RUNQ_MATCH_PRI &&
1253                             !(tp->t_schedflag & TS_RUNQMATCH))
1254                                 qlen -= RUNQ_MAX_DIFF;
1255                         if (qlen > 0) {
1256                                 cpu_t *newcp;
1257 
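                                /*
                                 * Pick a candidate CPU to compare run queue
                                 * lengths against: the next CPU in this
                                 * lgroup, or the next CPU in the partition
                                 * when we are in the root lgroup or alone
                                 * in our lgroup.
                                 */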
1258                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259                                         newcp = cp->cpu_next_part;
1260                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1261                                         newcp = cp->cpu_next_part;
1262                                 }
1263 
1264                                 if (ht_should_run(tp, newcp) &&
1265                                     RUNQ_LEN(newcp, tpri) < qlen) {
1266                                         DTRACE_PROBE3(runq__balance,
1267                                             kthread_t *, tp,
1268                                             cpu_t *, cp, cpu_t *, newcp);
1269                                         cp = newcp;
1270                                 }
1271                         }
1272                 } else {
1273                         /*
1274                          * Migrate to a cpu in the new partition.
1275                          */
1276                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1277                             tp->t_pri);
1278                 }
1279                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1280         } else {
1281                 /*
1282                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1283                  * a short time until weak binding that existed when the
1284                  * strong binding was established has dropped) so we must
1285                  * favour weak binding over strong.
1286                  */
1287                 cp = tp->t_weakbound_cpu ?
1288                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1289         }
1290         /*
1291          * A thread that is ONPROC may be temporarily placed on the run queue
1292          * but then chosen to run again by disp.  If the thread we're placing on
1293          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1294          * replacement process is actually scheduled in swtch().  In this
1295          * situation, curthread is the only thread that could be in the ONPROC
1296          * state.
1297          */
1298         if ((!self) && (tp->t_waitrq == 0)) {
1299                 hrtime_t curtime;
1300 
1301                 curtime = gethrtime_unscaled();
1302                 (void) cpu_update_pct(tp, curtime);
1303                 tp->t_waitrq = curtime;
1304         } else {
1305                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1306         }
1307 
1308         dp = cp->cpu_disp;
1309         disp_lock_enter_high(&dp->disp_lock);
1310 
1311         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1312         TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1313             tpri, cp, tp);
1314 
1315 #ifndef NPROBE
1316         /* Kernel probe */
1317         if (tnf_tracing_active)
1318                 tnf_thread_queue(tp, cp, tpri);
1319 #endif /* NPROBE */
1320 
1321         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1322 
1323         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1324         tp->t_disp_queue = dp;
1325         tp->t_link = NULL;
1326 
1327         dq = &dp->disp_q[tpri];
1328         dp->disp_nrunnable++;
1329         if (!bound)
1330                 dp->disp_steal = 0;
1331         membar_enter();
1332 
1333         if (dq->dq_sruncnt++ != 0) {
1334                 ASSERT(dq->dq_first != NULL);
1335                 dq->dq_last->t_link = tp;
1336                 dq->dq_last = tp;
1337         } else {
1338                 ASSERT(dq->dq_first == NULL);
1339                 ASSERT(dq->dq_last == NULL);
1340                 dq->dq_first = dq->dq_last = tp;
1341                 BT_SET(dp->disp_qactmap, tpri);
1342                 if (tpri > dp->disp_maxrunpri) {
1343                         dp->disp_maxrunpri = tpri;
1344                         membar_enter();
1345                         cpu_resched(cp, tpri);
1346                 }
1347         }
1348 
1349         if (!bound && tpri > dp->disp_max_unbound_pri) {
1350                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1351                         /*
1352                          * If there are no other unbound threads on the
1353                          * run queue, don't allow other CPUs to steal
1354                          * this thread while we are in the middle of a
1355                          * context switch. We may just switch to it
1356                          * again right away. CPU_DISP_DONTSTEAL is cleared
1357                          * in swtch and swtch_to.
1358                          */
1359                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1360                 }
1361                 dp->disp_max_unbound_pri = tpri;
1362         }
1363         (*disp_enq_thread)(cp, bound);
1364 }
1365 
1366 /*
1367  * Put the specified thread on the front of the dispatcher
1368  * queue corresponding to its current priority.
1369  *
1370  * Called with the thread in transition, onproc or stopped state
1371  * and locked (transition implies locked) and at high spl.
1372  * Returns with the thread in TS_RUN state and still locked.
1373  */
1374 void
1375 setfrontdq(kthread_t *tp)
1376 {
1377         disp_t          *dp;
1378         dispq_t         *dq;
1379         cpu_t           *cp;
1380         pri_t           tpri;
1381         int             bound;
1382 
1383         ASSERT(THREAD_LOCK_HELD(tp));
1384         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1385         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1386 
1387         /*
1388          * If thread is "swapped" or on the swap queue don't
1389          * queue it, but wake sched.
1390          */
1391         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1392                 disp_swapped_setrun(tp);
1393                 return;
1394         }
1395 
1396         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1397                 bound = 1;
1398         else
1399                 bound = 0;
1400 
1401         tpri = DISP_PRIO(tp);
1402         if (ncpus == 1)
1403                 cp = tp->t_cpu;
1404         else if (!bound) {
1405                 if (tpri >= kpqpri) {
1406                         setkpdq(tp, SETKP_FRONT);
1407                         return;
1408                 }
1409                 cp = tp->t_cpu;
1410                 if (tp->t_cpupart == cp->cpu_part) {
1411                         /*
1412                          * We'll generally let this thread continue to run
1413                          * where it last ran, but will consider migration if:
1414                          * - The thread last ran outside its home lgroup.
1415                          * - The CPU where it last ran is the target of an
1416                          *   offline request (a thread_nomigrate() on the in
1417                          *   motion CPU relies on this when forcing a preempt).
1418                          * - The thread isn't the highest priority thread where
1419                          *   it last ran, and it is considered not likely to
1420                          *   have significant cache warmth.
1421                          */
1422                         if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1423                             cp == cpu_inmotion ||
1424                             (tpri < cp->cpu_disp->disp_maxrunpri &&
1425                             !THREAD_HAS_CACHE_WARMTH(tp))) {
1426                                 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1427                         }
1428                 } else {
1429                         /*
1430                          * Migrate to a cpu in the new partition.
1431                          */
1432                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1433                             tp, tp->t_pri);
1434                 }
1435                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1436         } else {
1437                 /*
1438                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1439                  * a short time until weak binding that existed when the
1440                  * strong binding was established has dropped) so we must
1441                  * favour weak binding over strong.
1442                  */
1443                 cp = tp->t_weakbound_cpu ?
1444                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1445         }
1446 
1447         /*
1448          * A thread that is ONPROC may be temporarily placed on the run queue
1449          * but then chosen to run again by disp.  If the thread we're placing on
1450          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1451          * replacement process is actually scheduled in swtch().  In this
1452          * situation, curthread is the only thread that could be in the ONPROC
1453          * state.
1454          */
1455         if ((tp != curthread) && (tp->t_waitrq == 0)) {
1456                 hrtime_t curtime;
1457 
1458                 curtime = gethrtime_unscaled();
1459                 (void) cpu_update_pct(tp, curtime);
1460                 tp->t_waitrq = curtime;
1461         } else {
1462                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1463         }
1464 
1465         dp = cp->cpu_disp;
1466         disp_lock_enter_high(&dp->disp_lock);
1467 
1468         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1469         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1470 
1471 #ifndef NPROBE
1472         /* Kernel probe */
1473         if (tnf_tracing_active)
1474                 tnf_thread_queue(tp, cp, tpri);
1475 #endif /* NPROBE */
1476 
1477         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1478 
1479         THREAD_RUN(tp, &dp->disp_lock);          /* set TS_RUN state and lock */
1480         tp->t_disp_queue = dp;
1481 
1482         dq = &dp->disp_q[tpri];
1483         dp->disp_nrunnable++;
1484         if (!bound)
1485                 dp->disp_steal = 0;
1486         membar_enter();
1487 
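             /*
              * If other loaded, runnable threads are already queued at this
              * priority, link the thread at the front of the list.  Otherwise
              * initialize the (previously empty) per-priority queue, mark the
              * priority active in the qactmap bitmap, and prod the CPU via
              * cpu_resched() if this becomes its highest runnable priority.
              */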
1488         if (dq->dq_sruncnt++ != 0) {
1489                 ASSERT(dq->dq_last != NULL);
1490                 tp->t_link = dq->dq_first;
1491                 dq->dq_first = tp;
1492         } else {
1493                 ASSERT(dq->dq_last == NULL);
1494                 ASSERT(dq->dq_first == NULL);
1495                 tp->t_link = NULL;
1496                 dq->dq_first = dq->dq_last = tp;
1497                 BT_SET(dp->disp_qactmap, tpri);
1498                 if (tpri > dp->disp_maxrunpri) {
1499                         dp->disp_maxrunpri = tpri;
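                             /*
                              * Make the new disp_maxrunpri visible to other
                              * CPUs before cpu_resched() asks the target CPU
                              * to reschedule.
                              */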
1500                         membar_enter();
1501                         cpu_resched(cp, tpri);
1502                 }
1503         }
1504 
1505         if (!bound && tpri > dp->disp_max_unbound_pri) {
1506                 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1507                     cp == CPU) {
1508                         /*
1509                          * If there are no other unbound threads on the
1510                          * run queue, don't allow other CPUs to steal
1511                          * this thread while we are in the middle of a
1512                          * context switch. We may just switch to it
1513                          * again right away. CPU_DISP_DONTSTEAL is cleared
1514                          * in swtch and swtch_to.
1515                          */
1516                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1517                 }
1518                 dp->disp_max_unbound_pri = tpri;
1519         }
1520         (*disp_enq_thread)(cp, bound);
1521 }
1522 
1523 /*
1524  * Put a high-priority unbound thread on the kp queue
1525  */
1526 static void
1527 setkpdq(kthread_t *tp, int borf)
1528 {
1529         dispq_t *dq;
1530         disp_t  *dp;
1531         cpu_t   *cp;
1532         pri_t   tpri;
1533 
1534         tpri = DISP_PRIO(tp);
1535 
1536         dp = &tp->t_cpupart->cp_kp_queue;
1537         disp_lock_enter_high(&dp->disp_lock);
1538 
1539         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1540 
1541         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1542         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1543         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1544         tp->t_disp_queue = dp;
1545         dp->disp_nrunnable++;
1546         dq = &dp->disp_q[tpri];
1547 
1548         if (dq->dq_sruncnt++ != 0) {
1549                 if (borf == SETKP_BACK) {
1550                         ASSERT(dq->dq_first != NULL);
1551                         tp->t_link = NULL;
1552                         dq->dq_last->t_link = tp;
1553                         dq->dq_last = tp;
1554                 } else {
1555                         ASSERT(dq->dq_last != NULL);
1556                         tp->t_link = dq->dq_first;
1557                         dq->dq_first = tp;
1558                 }
1559         } else {
1560                 if (borf == SETKP_BACK) {
1561                         ASSERT(dq->dq_first == NULL);
1562                         ASSERT(dq->dq_last == NULL);
1563                         dq->dq_first = dq->dq_last = tp;
1564                 } else {
1565                         ASSERT(dq->dq_last == NULL);
1566                         ASSERT(dq->dq_first == NULL);
1567                         tp->t_link = NULL;
1568                         dq->dq_first = dq->dq_last = tp;
1569                 }
1570                 BT_SET(dp->disp_qactmap, tpri);
1571                 if (tpri > dp->disp_max_unbound_pri)
1572                         dp->disp_max_unbound_pri = tpri;
1573                 if (tpri > dp->disp_maxrunpri) {
1574                         dp->disp_maxrunpri = tpri;
1575                         membar_enter();
1576                 }
1577         }
1578 
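             /*
              * Choose a CPU to prod: start from the CPU the thread last ran
              * on (or the new partition's CPU list if the thread has changed
              * partitions), let disp_lowpri_cpu() pick the best candidate,
              * and ask it to reschedule.
              */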
1579         cp = tp->t_cpu;
1580         if (tp->t_cpupart != cp->cpu_part) {
1581                 /* migrate to a cpu in the new partition */
1582                 cp = tp->t_cpupart->cp_cpulist;
1583         }
1584         cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1585         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1586         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1587 
1588 #ifndef NPROBE
1589         /* Kernel probe */
1590         if (tnf_tracing_active)
1591                 tnf_thread_queue(tp, cp, tpri);
1592 #endif /* NPROBE */
1593 
1594         if (cp->cpu_chosen_level < tpri)
1595                 cp->cpu_chosen_level = tpri;
1596         cpu_resched(cp, tpri);
1597         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1598         (*disp_enq_thread)(cp, 0);
1599 }
1600 
1601 /*
1602  * Remove a thread from the dispatcher queue if it is on it.
1603  * It is not an error if the thread is not found, but we return whether
1604  * or not it was found so that the caller can check.
1605  */
1606 int
1607 dispdeq(kthread_t *tp)
1608 {
1609         disp_t          *dp;
1610         dispq_t         *dq;
1611         kthread_t       *rp;
1612         kthread_t       *trp;
1613         kthread_t       **ptp;
1614         int             tpri;
1615 
1616         ASSERT(THREAD_LOCK_HELD(tp));
1617 
1618         if (tp->t_state != TS_RUN)
1619                 return (0);
1620 
1621         /*
1622          * The thread is "swapped" or is on the swap queue and
1623          * hence no longer on the run queue, so return true.
1624          */
1625         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1626                 return (1);
1627 
1628         tpri = DISP_PRIO(tp);
1629         dp = tp->t_disp_queue;
1630         ASSERT(tpri < dp->disp_npri);
1631         dq = &dp->disp_q[tpri];
1632         ptp = &dq->dq_first;
1633         rp = *ptp;
1634         trp = NULL;
1635 
1636         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1637 
1638         /*
1639          * Search for thread in queue.
1640          * Double links would simplify this at the expense of disp/setrun.
1641          */
1642         while (rp != tp && rp != NULL) {
1643                 trp = rp;
1644                 ptp = &trp->t_link;
1645                 rp = trp->t_link;
1646         }
1647 
1648         if (rp == NULL) {
1649                 panic("dispdeq: thread not on queue");
1650         }
1651 
1652         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1653 
1654         /*
1655          * Found it so remove it from queue.
1656          */
1657         if ((*ptp = rp->t_link) == NULL)
1658                 dq->dq_last = trp;
1659 
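             /*
              * Queue bookkeeping: if this was the last loaded, runnable thread
              * at this priority, clear the priority's bit in the active bitmap
              * and recompute disp_maxrunpri (and, if necessary, cap
              * disp_max_unbound_pri) from the remaining active priorities.
              */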
1660         dp->disp_nrunnable--;
1661         if (--dq->dq_sruncnt == 0) {
1662                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1663                 if (dp->disp_nrunnable == 0) {
1664                         dp->disp_max_unbound_pri = -1;
1665                         dp->disp_maxrunpri = -1;
1666                 } else if (tpri == dp->disp_maxrunpri) {
1667                         int ipri;
1668 
1669                         ipri = bt_gethighbit(dp->disp_qactmap,
1670                             dp->disp_maxrunpri >> BT_ULSHIFT);
1671                         if (ipri < dp->disp_max_unbound_pri)
1672                                 dp->disp_max_unbound_pri = ipri;
1673                         dp->disp_maxrunpri = ipri;
1674                 }
1675         }
1676         tp->t_link = NULL;
1677         THREAD_TRANSITION(tp);          /* put in intermediate state */
1678         return (1);
1679 }
1680 
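     /*
      * Illustrative caller pattern (a sketch only, not lifted from an actual
      * caller): the thread lock must be held across the dequeue and whatever
      * requeue or state change follows, e.g. a hypothetical priority change:
      *
      *		thread_lock(tp);
      *		if (dispdeq(tp)) {
      *			tp->t_pri = new_pri;
      *			setbackdq(tp);
      *		}
      *		thread_unlock(tp);
      */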
1681 
1682 /*
1683  * dq_sruninc and dq_srundec are public functions for
1684  * incrementing/decrementing the sruncnts when a thread on
1685  * a dispatcher queue is made schedulable/unschedulable by
1686  * resetting the TS_LOAD flag.
1687  *
1688  * The caller MUST hold the thread lock, and therefore the dispatcher
1689  * queue lock, so that the operation which changes
1690  * the flag, the operation that checks whether the thread is on a
1691  * disp queue, and the call to this function
1692  * are all one atomic operation with respect to interrupts.
1693  */
1694 
1695 /*
1696  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1697  */
1698 void
1699 dq_sruninc(kthread_t *t)
1700 {
1701         ASSERT(t->t_state == TS_RUN);
1702         ASSERT(t->t_schedflag & TS_LOAD);
1703 
1704         THREAD_TRANSITION(t);
1705         setfrontdq(t);
1706 }
1707 
1708 /*
1709  * See comment on calling conventions above.
1710  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1711  */
1712 void
1713 dq_srundec(kthread_t *t)
1714 {
1715         ASSERT(t->t_schedflag & TS_LOAD);
1716 
1717         (void) dispdeq(t);
1718         disp_swapped_enq(t);
1719 }
1720 
1721 /*
1722  * Change the dispatcher lock of the thread to the "swapped_lock"
1723  * and return with thread lock still held.
1724  *
1725  * Called with thread_lock held, in transition state, and at high spl.
1726  */
1727 void
1728 disp_swapped_enq(kthread_t *tp)
1729 {
1730         ASSERT(THREAD_LOCK_HELD(tp));
1731         ASSERT(tp->t_schedflag & TS_LOAD);
1732 
1733         switch (tp->t_state) {
1734         case TS_RUN:
1735                 disp_lock_enter_high(&swapped_lock);
1736                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1737                 break;
1738         case TS_ONPROC:
1739                 disp_lock_enter_high(&swapped_lock);
1740                 THREAD_TRANSITION(tp);
1741                 wake_sched_sec = 1;             /* tell clock to wake sched */
1742                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1743                 break;
1744         default:
1745                 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1746         }
1747 }
1748 
1749 /*
1750  * This routine is called by setbackdq/setfrontdq if the thread is
1751  * not loaded, or is loaded and on the swap queue.
1752  *
1753  * Thread state TS_SLEEP implies that a swapped thread
1754  * has been woken up and needs to be swapped in by the swapper.
1755  *
1756  * Thread state TS_RUN implies that the priority of a swapped
1757  * thread is being increased by its scheduling class (e.g. ts_update).
1758  */
1759 static void
1760 disp_swapped_setrun(kthread_t *tp)
1761 {
1762         ASSERT(THREAD_LOCK_HELD(tp));
1763         ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1764 
1765         switch (tp->t_state) {
1766         case TS_SLEEP:
1767                 disp_lock_enter_high(&swapped_lock);
1768                 /*
1769                  * Wakeup sched immediately (i.e., next tick) if the
1770                  * thread priority is above maxclsyspri.
1771                  */
1772                 if (DISP_PRIO(tp) > maxclsyspri)
1773                         wake_sched = 1;
1774                 else
1775                         wake_sched_sec = 1;
1776                 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1777                 break;
1778         case TS_RUN:                            /* called from ts_update */
1779                 break;
1780         default:
1781                 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1782         }
1783 }
1784 
1785 /*
1786  *      Make a thread give up its processor.  Find the processor on
1787  *      which this thread is executing, and have that processor
1788  *      preempt.
1789  *
1790  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1791  *      they are running at kernel priorities.  To implement this, we always
1792  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1793  *      calls cpu_surrender() very often, we only preempt if there is anyone
1794  *      competing with us.
1795  */
1796 void
1797 cpu_surrender(kthread_t *tp)
1798 {
1799         cpu_t   *cpup;
1800         int     max_pri;
1801         int     max_run_pri;
1802         klwp_t  *lwp;
1803 
1804         ASSERT(THREAD_LOCK_HELD(tp));
1805 
1806         if (tp->t_state != TS_ONPROC)
1807                 return;
1808         cpup = tp->t_disp_queue->disp_cpu;        /* CPU thread dispatched to */
1809         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1810         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1811         if (max_pri < max_run_pri)
1812                 max_pri = max_run_pri;
1813 
1814         if (tp->t_cid == sysdccid) {
1815                 uint_t t_pri = DISP_PRIO(tp);
1816                 if (t_pri > max_pri)
1817                         return;         /* we are not competing w/ anyone */
1818                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1819         } else {
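                     /*
                      * Request user-level preemption; also request kernel
                      * preemption if the highest competing priority is at or
                      * above kpreemptpri.
                      */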
1820                 cpup->cpu_runrun = 1;
1821                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1822                         cpup->cpu_kprunrun = 1;
1823                 }
1824         }
1825 
1826         /*
1827          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1828          */
1829         membar_enter();
1830 
1831         DTRACE_SCHED1(surrender, kthread_t *, tp);
1832 
1833         /*
1834          * Make the target thread take an excursion through trap()
1835          * to do preempt() (unless we're already in trap or post_syscall,
1836          * calling cpu_surrender via CL_TRAPRET).
1837          */
1838         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1839             lwp->lwp_state != LWP_USER) {
1840                 aston(tp);
1841                 if (cpup != CPU)
1842                         poke_cpu(cpup->cpu_id);
1843         }
1844         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1845             "cpu_surrender:tid %p cpu %p", tp, cpup);
1846 }
1847 
1848 /*
1849  * Commit to and ratify a scheduling decision
1850  */
1851 /*ARGSUSED*/
1852 static kthread_t *
1853 disp_ratify(kthread_t *tp, disp_t *kpq)
1854 {
1855         pri_t   tpri, maxpri;
1856         pri_t   maxkpri;
1857         cpu_t   *cpup;
1858 
1859         ASSERT(tp != NULL);
1860         /*
1861          * Commit to, then ratify scheduling decision
1862          */
1863         cpup = CPU;
1864         if (cpup->cpu_runrun != 0)
1865                 cpup->cpu_runrun = 0;
1866         if (cpup->cpu_kprunrun != 0)
1867                 cpup->cpu_kprunrun = 0;
1868         if (cpup->cpu_chosen_level != -1)
1869                 cpup->cpu_chosen_level = -1;
1870         membar_enter();
1871         tpri = DISP_PRIO(tp);
1872         maxpri = cpup->cpu_disp->disp_maxrunpri;
1873         maxkpri = kpq->disp_maxrunpri;
1874         if (maxpri < maxkpri)
1875                 maxpri = maxkpri;
1876         if (tpri < maxpri) {
1877                 /*
1878                  * should have done better
1879                  * put this one back and indicate to try again
1880                  */
1881                 cpup->cpu_dispthread = curthread;    /* fixup dispthread */
1882                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1883                 thread_lock_high(tp);
1884                 THREAD_TRANSITION(tp);
1885                 setfrontdq(tp);
1886                 thread_unlock_nopreempt(tp);
1887 
1888                 tp = NULL;
1889         }
1890         return (tp);
1891 }
1892 
1893 /*
1894  * See if there is any work on the dispatcher queue for other CPUs.
1895  * If there is, dequeue the best thread and return.
1896  */
1897 static kthread_t *
1898 disp_getwork(cpu_t *cp)
1899 {
1900         cpu_t           *ocp;           /* other CPU */
1901         cpu_t           *ocp_start;
1902         cpu_t           *tcp;           /* target local CPU */
1903         kthread_t       *tp;
1904         kthread_t       *retval = NULL;
1905         pri_t           maxpri;
1906         disp_t          *kpq;           /* kp queue for this partition */
1907         lpl_t           *lpl, *lpl_leaf;
1908         int             leafidx, startidx;
1909         hrtime_t        stealtime;
1910         lgrp_id_t       local_id;
1911 
1912         maxpri = -1;
1913         tcp = NULL;
1914 
1915         kpq = &cp->cpu_part->cp_kp_queue;
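             /*
              * Check the partition-wide kernel preemption queue first.
              * disp_getbest() may come back empty-handed (e.g. another CPU
              * drained the queue before we took its lock), so loop until
              * disp_maxrunpri shows there is really nothing left.
              */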
1916         while (kpq->disp_maxrunpri >= 0) {
1917                 /*
1918                  * Try to take a thread from the kp_queue.
1919                  */
1920                 tp = disp_getbest(kpq);
1921                 if (tp)
1922                         return (disp_ratify(tp, kpq));
1923         }
1924 
1925         kpreempt_disable();             /* protect the cpu_active list */
1926 
1927         /*
1928          * Try to find something to do on another CPU's run queue.
1929          * Loop through all other CPUs looking for the one with the highest
1930          * priority unbound thread.
1931          *
1932          * On NUMA machines, the partition's CPUs are consulted in order of
1933          * distance from the current CPU. This way, the first available
1934          * work found is also the closest, and will suffer the least
1935          * from being migrated.
1936          */
1937         lpl = lpl_leaf = cp->cpu_lpl;
1938         local_id = lpl_leaf->lpl_lgrpid;
1939         leafidx = startidx = 0;
1940 
1941         /*
1942          * This loop traverses the lpl hierarchy. Higher level lpls represent
1943          * broader levels of locality
1944          */
1945         do {
1946                 /* This loop iterates over the lpl's leaves */
1947                 do {
1948                         if (lpl_leaf != cp->cpu_lpl)
1949                                 ocp = lpl_leaf->lpl_cpus;
1950                         else
1951                                 ocp = cp->cpu_next_lpl;
1952 
1953                         /* This loop iterates over the CPUs in the leaf */
1954                         ocp_start = ocp;
1955                         do {
1956                                 pri_t pri;
1957 
1958                                 ASSERT(CPU_ACTIVE(ocp));
1959 
1960                                 /*
1961                                  * End our stroll around this lpl if:
1962                                  *
1963                                  * - Something became runnable on the local
1964                                  *   queue...which also ends our stroll around
1965                                  *   the partition.
1966                                  *
1967                                  * - We happen across another idle CPU.
1968                                  *   Since it is patrolling the next portion
1969                                  *   of the lpl's list (assuming it's not
1970                                  *   halted, or busy servicing an interrupt),
1971                                  *   move to the next higher level of locality.
1972                                  */
1973                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1974                                         kpreempt_enable();
1975                                         return (NULL);
1976                                 }
1977                                 if (ocp->cpu_dispatch_pri == -1) {
1978                                         if (ocp->cpu_disp_flags &
1979                                             CPU_DISP_HALTED ||
1980                                             ocp->cpu_intr_actv != 0)
1981                                                 continue;
1982                                         else
1983                                                 goto next_level;
1984                                 }
1985 
1986                                 /*
1987                                  * If there's only one thread and the CPU
1988                                  * is in the middle of a context switch,
1989                                  * or it's currently running the idle thread,
1990                                  * don't steal it.
1991                                  */
1992                                 if ((ocp->cpu_disp_flags &
1993                                     CPU_DISP_DONTSTEAL) &&
1994                                     ocp->cpu_disp->disp_nrunnable == 1)
1995                                         continue;
1996 
1997                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
1998                                 if (pri > maxpri) {
1999                                         /*
2000                                          * Don't steal threads that we attempted
2001                                          * to steal recently until they're ready
2002                                          * to be stolen again.
2003                                          */
2004                                         stealtime = ocp->cpu_disp->disp_steal;
2005                                         if (stealtime == 0 ||
2006                                             stealtime - gethrtime() <= 0) {
2007                                                 maxpri = pri;
2008                                                 tcp = ocp;
2009                                         } else {
2010                                                 /*
2011                                                  * Don't update tcp, just set
2012                                                  * the retval to T_DONTSTEAL, so
2013                                                  * that if no acceptable CPUs
2014                                                  * are found the return value
2015                                                  * will be T_DONTSTEAL rather
2016                                                  * then NULL.
2017                                                  */
2018                                                 retval = T_DONTSTEAL;
2019                                         }
2020                                 }
2021                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2022 
2023                         /*
2024                          * Iterate to the next leaf lpl in the resource set
2025                          * at this level of locality. If we hit the end of
2026                          * the set, wrap back around to the beginning.
2027                          *
2028                          * Note: This iteration is NULL terminated for a reason;
2029                          * see lpl_topo_bootstrap() in lgrp.c for details.
2030                          */
2031                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2032                                 leafidx = 0;
2033                                 lpl_leaf = lpl->lpl_rset[leafidx];
2034                         }
2035                 } while (leafidx != startidx);
2036 
2037 next_level:
2038                 /*
2039                  * Expand the search to include farther away CPUs (next
2040                  * locality level). The closer CPUs that have already been
2041                  * checked will be checked again. In doing so, idle CPUs
2042                  * will tend to be more aggressive about stealing from CPUs
2043                  * that are closer (since the closer CPUs will be considered
2044                  * more often).
2045                  * Begin at this level with the CPUs local leaf lpl.
2046                  */
2047                 if ((lpl = lpl->lpl_parent) != NULL) {
2048                         leafidx = startidx = lpl->lpl_id2rset[local_id];
2049                         lpl_leaf = lpl->lpl_rset[leafidx];
2050                 }
2051         } while (!tcp && lpl);
2052 
2053         kpreempt_enable();
2054 
2055         /*
2056          * If another queue looks good, and there is still nothing on
2057          * the local queue, try to transfer one or more threads
2058          * from it to our queue.
2059          */
2060         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2061                 tp = disp_getbest(tcp->cpu_disp);
2062                 if (tp == NULL || tp == T_DONTSTEAL)
2063                         return (tp);
2064                 return (disp_ratify(tp, kpq));
2065         }
2066         return (retval);
2067 }
2068 
2069 
2070 /*
2071  * disp_fix_unbound_pri()
2072  *      Determines the maximum priority of unbound threads on the queue.
2073  *      The priority is kept for the queue, but is only increased, never
2074  *      reduced unless some CPU is looking for something on that queue.
2075  *
2076  *      The priority argument is the known upper limit.
2077  *
2078  *      Perhaps this should be kept accurately, but that probably means
2079  *      separate bitmaps for bound and unbound threads.  Since only idled
2080  *      CPUs will have to do this recalculation, it seems better this way.
2081  */
2082 static void
2083 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2084 {
2085         kthread_t       *tp;
2086         dispq_t         *dq;
2087         ulong_t         *dqactmap = dp->disp_qactmap;
2088         ulong_t         mapword;
2089         int             wx;
2090 
2091         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2092 
2093         ASSERT(pri >= 0);                    /* checked by caller */
2094 
2095         /*
2096          * Start the search at the next lowest priority below the supplied
2097          * priority.  This depends on the bitmap implementation.
2098          */
2099         do {
2100                 wx = pri >> BT_ULSHIFT;           /* index of word in map */
2101 
2102                 /*
2103                  * Form mask for all lower priorities in the word.
2104                  */
2105                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2106 
2107                 /*
2108                  * Get next lower active priority.
2109                  */
2110                 if (mapword != 0) {
2111                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2112                 } else if (wx > 0) {
2113                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2114                         if (pri < 0)
2115                                 break;
2116                 } else {
2117                         pri = -1;
2118                         break;
2119                 }
2120 
2121                 /*
2122                  * Search the queue for unbound, runnable threads.
2123                  */
2124                 dq = &dp->disp_q[pri];
2125                 tp = dq->dq_first;
2126 
2127                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2128                         tp = tp->t_link;
2129                 }
2130 
2131                 /*
2132                  * If an unbound thread was found, exit the loop; pri is its priority.
2133                  */
2134         } while (tp == NULL);
2135 
2136         /*
2137          * pri holds the maximum unbound thread priority or -1.
2138          */
2139         if (dp->disp_max_unbound_pri != pri)
2140                 dp->disp_max_unbound_pri = pri;
2141 }
2142 
2143 /*
2144  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2145  *      check if the CPU to which it was previously bound should have
2146  *      its disp_max_unbound_pri increased.
2147  */
2148 void
2149 disp_adjust_unbound_pri(kthread_t *tp)
2150 {
2151         disp_t *dp;
2152         pri_t tpri;
2153 
2154         ASSERT(THREAD_LOCK_HELD(tp));
2155 
2156         /*
2157          * Don't do anything if the thread is not bound, or
2158          * currently not runnable or swapped out.
2159          */
2160         if (tp->t_bound_cpu == NULL ||
2161             tp->t_state != TS_RUN ||
2162             tp->t_schedflag & TS_ON_SWAPQ)
2163                 return;
2164 
2165         tpri = DISP_PRIO(tp);
2166         dp = tp->t_bound_cpu->cpu_disp;
2167         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2168         if (tpri > dp->disp_max_unbound_pri)
2169                 dp->disp_max_unbound_pri = tpri;
2170 }
2171 
2172 /*
2173  * disp_getbest()
2174  *   De-queue the highest priority unbound runnable thread.
2175  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2176  *   Returns NULL if nothing found.
2177  *   Returns T_DONTSTEAL if the thread was not stealable,
2178  *   so that the caller will try again later.
2179  *
2180  *   Passed a pointer to a dispatch queue that is not associated with
2181  *   this CPU.
2182  */
2183 static kthread_t *
2184 disp_getbest(disp_t *dp)
2185 {
2186         kthread_t       *tp;
2187         dispq_t         *dq;
2188         pri_t           pri;
2189         cpu_t           *cp, *tcp;
2190         boolean_t       allbound;
2191 
2192         disp_lock_enter(&dp->disp_lock);
2193 
2194         /*
2195          * If there is nothing to run, or the CPU is in the middle of a
2196          * context switch of the only thread, return NULL.
2197          */
2198         tcp = dp->disp_cpu;
2199         cp = CPU;
2200         pri = dp->disp_max_unbound_pri;
2201         if (pri == -1 ||
2202             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2203             tcp->cpu_disp->disp_nrunnable == 1)) {
2204                 disp_lock_exit_nopreempt(&dp->disp_lock);
2205                 return (NULL);
2206         }
2207 
2208         dq = &dp->disp_q[pri];
2209 
2210 
2211         /*
2212          * Assume that all threads are bound on this queue, and change it
2213          * later when we find out that it is not the case.
2214          */
2215         allbound = B_TRUE;
2216         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2217                 hrtime_t now, nosteal, rqtime;
2218 
2219                 /*
2220                  * Skip over bound threads which could be here even
2221                  * though disp_max_unbound_pri indicated this level.
2222                  */
2223                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2224                         continue;
2225 
2226                 /*
2227                  * We've got some unbound threads on this queue, so turn
2228                  * the allbound flag off now.
2229                  */
2230                 allbound = B_FALSE;
2231 
2232                 /*
2233                  * The thread is a candidate for stealing from its run queue. We
2234                  * don't want to steal threads that became runnable just a
2235                  * moment ago. This improves CPU affinity for threads that get
2236                  * preempted for short periods of time and go back on the run
2237                  * queue.
2238                  *
2239                  * We want to let it stay on its run queue if it was only placed
2240                  * there recently and it was running on the same CPU before that
2241                  * to preserve its cache investment. For the thread to remain on
2242                  * its run queue, ALL of the following conditions must be
2243                  * satisfied:
2244                  *
2245                  * - the disp queue should not be the kernel preemption queue
2246                  * - delayed idle stealing should not be disabled
2247                  * - nosteal_nsec should be non-zero
2248                  * - it should run with user priority
2249                  * - it should be on the run queue of the CPU where it was
2250                  *   running before being placed on the run queue
2251                  * - it should be the only thread on the run queue (to prevent
2252                  *   extra scheduling latency for other threads)
2253                  * - it should sit on the run queue for less than per-chip
2254                  *   nosteal interval or global nosteal interval
2255                  * - in case of CPUs with shared cache it should sit in a run
2256                  *   queue of a CPU from a different chip
2257                  *
2258                  * The checks are arranged so that the ones that are faster are
2259                  * placed earlier.
2260                  */
2261                 if (tcp == NULL ||
2262                     pri >= minclsyspri ||
2263                     tp->t_cpu != tcp)
2264                         break;
2265 
2266                 /*
2267                  * Steal immediately if, due to the CMT processor architecture,
2268                  * migration between cp and tcp would incur no performance
2269                  * penalty.
2270                  */
2271                 if (pg_cmt_can_migrate(cp, tcp))
2272                         break;
2273 
2274                 nosteal = nosteal_nsec;
2275                 if (nosteal == 0)
2276                         break;
2277 
2278                 /*
2279                  * Calculate time spent sitting on run queue
2280                  */
2281                 now = gethrtime_unscaled();
2282                 rqtime = now - tp->t_waitrq;
2283                 scalehrtime(&rqtime);
2284 
2285                 /*
2286                  * Steal immediately if the time spent on this run queue is more
2287                  * than allowed nosteal delay.
2288                  *
2289                  * Negative rqtime check is needed here to avoid infinite
2290                  * stealing delays caused by unlikely but not impossible
2291                  * drifts between CPU times on different CPUs.
2292                  */
2293                 if (rqtime > nosteal || rqtime < 0)
2294                         break;
2295 
2296                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2297                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2298                 scalehrtime(&now);
2299                 /*
2300                  * Calculate when this thread becomes stealable
2301                  */
2302                 now += (nosteal - rqtime);
2303 
2304                 /*
2305                  * Calculate time when some thread becomes stealable
2306                  */
2307                 if (now < dp->disp_steal)
2308                         dp->disp_steal = now;
2309         }
2310 
2311         /*
2312          * If there were no unbound threads on this queue, find the priority
2313          * level where they are and then return later. The value of
2314          * disp_max_unbound_pri is not always accurate because it isn't
2315          * reduced until another idle CPU looks for work.
2316          */
2317         if (allbound)
2318                 disp_fix_unbound_pri(dp, pri);
2319 
2320         /*
2321          * If we reached the end of the queue and found no unbound threads
2322          * then return NULL so that other CPUs will be considered.  If there
2323          * are unbound threads but they cannot yet be stolen, then
2324          * return T_DONTSTEAL and try again later.
2325          */
2326         if (tp == NULL) {
2327                 disp_lock_exit_nopreempt(&dp->disp_lock);
2328                 return (allbound ? NULL : T_DONTSTEAL);
2329         }
2330 
2331         /*
2332          * Found a runnable, unbound thread, so remove it from queue.
2333          * dispdeq() requires that we have the thread locked, and we do,
2334          * by virtue of holding the dispatch queue lock.  dispdeq() will
2335          * put the thread in transition state, thereby dropping the dispq
2336          * lock.
2337          */
2338 
2339 #ifdef DEBUG
2340         {
2341                 int     thread_was_on_queue;
2342 
2343                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2344                 ASSERT(thread_was_on_queue);
2345         }
2346 
2347 #else /* DEBUG */
2348         (void) dispdeq(tp);                     /* drops disp_lock */
2349 #endif /* DEBUG */
2350 
2351         /*
2352          * Reset the disp_queue steal time - we do not know what the smallest
2353          * value across the queue is.
2354          */
2355         dp->disp_steal = 0;
2356 
2357         tp->t_schedflag |= TS_DONT_SWAP;
2358 
2359         /*
2360          * Setup thread to run on the current CPU.
2361          */
2362         tp->t_disp_queue = cp->cpu_disp;
2363 
2364         cp->cpu_dispthread = tp;             /* protected by spl only */
2365         cp->cpu_dispatch_pri = pri;
2366 
2367         /*
2368          * There can be a memory synchronization race between disp_getbest()
2369          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2370          * to preempt the current thread to run the enqueued thread while
2371          * disp_getbest() and disp_ratify() are changing the current thread
2372          * to the stolen thread. This may lead to a situation where
2373          * cpu_resched() tries to preempt the wrong thread and the
2374          * stolen thread continues to run on the CPU which has been tagged
2375          * for preemption.
2376          * Later the clock thread gets enqueued but doesn't get to run on the
2377          * CPU causing the system to hang.
2378          *
2379          * To avoid this, grabbing and dropping the disp_lock (which does
2380          * a memory barrier) is needed to synchronize the execution of
2381          * cpu_resched() with disp_getbest() and disp_ratify() and
2382          * synchronize the memory read and written by cpu_resched(),
2383          * disp_getbest(), and disp_ratify() with each other.
2384          *  (see CR#6482861 for more details).
2385          */
2386         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2387         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2388 
2389         ASSERT(pri == DISP_PRIO(tp));
2390 
2391         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2392 
2393         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2394 
2395         /*
2396          * Return with spl high so that swtch() won't need to raise it.
2397          * The disp_lock was dropped by dispdeq().
2398          */
2399 
2400         return (tp);
2401 }
2402 
2403 /*
2404  * disp_bound_common() - common routine for higher level functions
2405  *      that check for bound threads under certain conditions.
2406  *      If 'threadlistsafe' is set then there is no need to acquire
2407  *      pidlock to stop the thread list from changing (eg, if
2408  *      disp_bound_* is called with cpus paused).
2409  */
2410 static int
2411 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2412 {
2413         int             found = 0;
2414         kthread_t       *tp;
2415 
2416         ASSERT(flag);
2417 
2418         if (!threadlistsafe)
2419                 mutex_enter(&pidlock);
2420         tp = curthread;         /* faster than allthreads */
2421         do {
2422                 if (tp->t_state != TS_FREE) {
2423                         /*
2424                          * If an interrupt thread is busy, but the
2425                          * caller doesn't care (i.e. BOUND_INTR is off),
2426                          * then just ignore it and continue through.
2427                          */
2428                         if ((tp->t_flag & T_INTR_THREAD) &&
2429                             !(flag & BOUND_INTR))
2430                                 continue;
2431 
2432                         /*
2433                          * Skip the idle thread for the CPU
2434                          * we're about to set offline.
2435                          */
2436                         if (tp == cp->cpu_idle_thread)
2437                                 continue;
2438 
2439                         /*
2440                          * Skip the pause thread for the CPU
2441                          * we're about to set offline.
2442                          */
2443                         if (tp == cp->cpu_pause_thread)
2444                                 continue;
2445 
2446                         if ((flag & BOUND_CPU) &&
2447                             (tp->t_bound_cpu == cp ||
2448                             tp->t_bind_cpu == cp->cpu_id ||
2449                             tp->t_weakbound_cpu == cp)) {
2450                                 found = 1;
2451                                 break;
2452                         }
2453 
2454                         if ((flag & BOUND_PARTITION) &&
2455                             (tp->t_cpupart == cp->cpu_part)) {
2456                                 found = 1;
2457                                 break;
2458                         }
2459                 }
2460         } while ((tp = tp->t_next) != curthread && found == 0);
2461         if (!threadlistsafe)
2462                 mutex_exit(&pidlock);
2463         return (found);
2464 }
2465 
2466 /*
2467  * disp_bound_threads - return nonzero if threads are bound to the processor.
2468  *      Called infrequently.  Keep this simple.
2469  *      Includes threads that are asleep or stopped but not onproc.
2470  */
2471 int
2472 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2473 {
2474         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2475 }
2476 
2477 /*
2478  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2479  * to the given processor, including interrupt threads.
2480  */
2481 int
2482 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2483 {
2484         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2485 }
2486 
2487 /*
2488  * disp_bound_partition - return nonzero if threads are bound to the same
2489  * partition as the processor.
2490  *      Called infrequently.  Keep this simple.
2491  *      Includes threads that are asleep or stopped but not onproc.
2492  */
2493 int
2494 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2495 {
2496         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2497 }
2498 
2499 /*
2500  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2501  * threads to other CPUs.
2502  */
2503 void
2504 disp_cpu_inactive(cpu_t *cp)
2505 {
2506         kthread_t       *tp;
2507         disp_t          *dp = cp->cpu_disp;
2508         dispq_t         *dq;
2509         pri_t           pri;
2510         int             wasonq;
2511 
2512         disp_lock_enter(&dp->disp_lock);
2513         while ((pri = dp->disp_max_unbound_pri) != -1) {
2514                 dq = &dp->disp_q[pri];
2515                 tp = dq->dq_first;
2516 
2517                 /*
2518                  * Skip over bound threads.
2519                  */
2520                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2521                         tp = tp->t_link;
2522                 }
2523 
2524                 if (tp == NULL) {
2525                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2526                         disp_fix_unbound_pri(dp, pri);
2527                         continue;
2528                 }
2529 
2530                 wasonq = dispdeq(tp);           /* drops disp_lock */
2531                 ASSERT(wasonq);
2532                 ASSERT(tp->t_weakbound_cpu == NULL);
2533 
2534                 setbackdq(tp);
2535                 /*
2536                  * Called from cpu_offline:
2537                  *
2538                  * cp has already been removed from the list of active cpus
2539                  * and tp->t_cpu has been changed so there is no risk of
2540                  * tp ending up back on cp.
2541                  *
2542                  * Called from cpupart_move_cpu:
2543                  *
2544                  * The cpu has moved to a new cpupart.  Any threads that
2545                  * were on its dispatch queues before the move remain
2546                  * in the old partition and can't run in the new partition.
2547                  */
2548                 ASSERT(tp->t_cpu != cp);
2549                 thread_unlock(tp);
2550 
2551                 disp_lock_enter(&dp->disp_lock);
2552         }
2553         disp_lock_exit(&dp->disp_lock);
2554 }
2555 
2556 /*
2557  * Return a score rating this CPU for running this thread: lower is better.
2558  *
2559  * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2560  * curcpu (as that's our own priority).
2561  *
2562  * If a cpu is the target of an offline request, then try to avoid it.
2563  *
2564  * Otherwise we'll use double the effective dispatcher priority for the CPU.
2565  *
2566  * We do this so ht_adjust_cpu_score() can increment the score if needed,
2567  * without ending up overriding a dispatcher priority.
2568  */
2569 static pri_t
2570 cpu_score(cpu_t *cp, kthread_t *tp)
2571 {
2572         pri_t score;
2573 
2574         if (tp == curthread && cp == curthread->t_cpu)
2575                 score = 2 * CPU_IDLE_PRI;
2576         else if (cp == cpu_inmotion)
2577                 score = SHRT_MAX;
2578         else
2579                 score = 2 * cp->cpu_dispatch_pri;
2580 
2581         if (2 * cp->cpu_disp->disp_maxrunpri > score)
2582                 score = 2 * cp->cpu_disp->disp_maxrunpri;
2583         if (2 * cp->cpu_chosen_level > score)
2584                 score = 2 * cp->cpu_chosen_level;
2585 
2586         return (ht_adjust_cpu_score(tp, cp, score));
2587 }
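
     /*
      * To illustrate (assuming, as the check in disp_lowpri_cpu() below
      * implies, that an idle CPU reports cpu_dispatch_pri == CPU_IDLE_PRI):
      * an idle CPU with nothing queued and no chosen level scores
      * 2 * CPU_IDLE_PRI before the hyperthreading adjustment, which is why
      * disp_lowpri_cpu() can recognize an idle CPU by checking
      * score / 2 == CPU_IDLE_PRI.
      */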
2588 
2589 /*
2590  * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2591  *
2592  * We are looking for a CPU with an effective dispatch priority lower than the
2593  * thread's, so that the thread will run immediately rather than be enqueued.
2594  * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2595  * If we don't find an available CPU there, we will expand our search to include
2596  * wider locality levels. (Note these groups are already divided by CPU
2597  * partition.)
2598  *
2599  * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2600  * the best home CPU we found.
2601  *
2602  * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2603  * other CPU.  The caller should pass in the most recently used CPU for the
2604  * thread; it's of course possible that this CPU isn't in the home lgroup.
2605  *
2606  * This function must be called at either high SPL, or with preemption disabled,
2607  * so that the "hint" CPU cannot be removed from the online CPU list while we
2608  * are traversing it.
2609  */
2610 cpu_t *
2611 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2612 {
2613         cpu_t   *bestcpu;
2614         cpu_t   *besthomecpu;
2615         cpu_t   *cp, *cpstart;
2616 
2617         klgrpset_t      done;
2618 
2619         lpl_t           *lpl_iter, *lpl_leaf;
2620 
2621         ASSERT(hint != NULL);
2622         ASSERT(tp->t_lpl->lpl_ncpu > 0);
2623 
2624         bestcpu = besthomecpu = NULL;
2625         klgrpset_clear(done);
2626 
2627         lpl_iter = tp->t_lpl;
2628 
2629         do {
2630                 pri_t best = SHRT_MAX;
2631                 klgrpset_t cur_set;
2632 
2633                 klgrpset_clear(cur_set);
2634 
2635                 for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2636                         lpl_leaf = lpl_iter->lpl_rset[i];
2637                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2638                                 continue;
2639 
2640                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2641 
2642                         if (hint->cpu_lpl == lpl_leaf)
2643                                 cp = cpstart = hint;
2644                         else
2645                                 cp = cpstart = lpl_leaf->lpl_cpus;
2646 
2647                         do {
2648                                 pri_t score = cpu_score(cp, tp);
2649 
2650                                 if (score < best) {
2651                                         best = score;
2652                                         bestcpu = cp;
2653 
2654                                         /* An idle CPU: we're done. */
2655                                         if (score / 2 == CPU_IDLE_PRI)
2656                                                 goto out;
2657                                 }
2658                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2659                 }
2660 
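                     /*
                      * best is a doubled score (see cpu_score()); if the
                      * thread's priority beats best / 2, it can run on
                      * bestcpu immediately, so stop searching.
                      */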
2661                 if (bestcpu != NULL && tpri > (best / 2))
2662                         goto out;
2663 
2664                 if (besthomecpu == NULL)
2665                         besthomecpu = bestcpu;
2666 
2667                 /*
2668                  * Add the lgrps we just considered to the "done" set
2669                  */
2670                 klgrpset_or(done, cur_set);
2671 
2672         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2673 
2674         /*
2675          * The specified priority isn't high enough to run immediately
2676          * anywhere, so just return the best CPU from the home lgroup.
2677          */
2678         bestcpu = besthomecpu;
2679 
2680 out:
2681         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682         return (bestcpu);
2683 }
2684 
2685 /*
2686  * This routine provides the generic idle cpu function for all processors.
2687  * If a processor has some specific code to execute when idle (say, to stop
2688  * the pipeline and save power) then that routine should be defined in the
2689  * processors specific code (module_xx.c) and the global variable idle_cpu
2690  * set to that function.
2691  */
2692 static void
2693 generic_idle_cpu(void)
2694 {
2695 }
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }
2702 
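     /*
      * Pick a CPU for curthread to run on.  The caller must be running with
      * preemption disabled and have TS_VCPU set (as the ASSERTs enforce);
      * stay on the current CPU when ht_should_run() says that is acceptable,
      * and otherwise fall back to disp_lowpri_cpu() to find a better choice.
      */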
2703 cpu_t *
2704 disp_choose_best_cpu(void)
2705 {
2706         kthread_t *t = curthread;
2707         cpu_t *curcpu = CPU;
2708 
2709         ASSERT(t->t_preempt > 0);
2710         ASSERT(t->t_state == TS_ONPROC);
2711         ASSERT(t->t_schedflag & TS_VCPU);
2712 
2713         if (ht_should_run(t, curcpu))
2714                 return (curcpu);
2715 
2716         return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2717 }