1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2018, Joyent, Inc. All rights reserved.
28 */
29
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 /* All Rights Reserved */
32
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysinfo.h>
41 #include <sys/var.h>
42 #include <sys/errno.h>
43 #include <sys/cmn_err.h>
44 #include <sys/debug.h>
45 #include <sys/inline.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <sys/bitmap.h>
49 #include <sys/kmem.h>
50 #include <sys/cpuvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/tnf.h>
53 #include <sys/cpupart.h>
54 #include <sys/lgrp.h>
55 #include <sys/pg.h>
56 #include <sys/cmt.h>
57 #include <sys/bitset.h>
58 #include <sys/schedctl.h>
59 #include <sys/atomic.h>
60 #include <sys/dtrace.h>
61 #include <sys/sdt.h>
62 #include <sys/archsystm.h>
63 #include <sys/ht.h>
64
65 #include <vm/as.h>
66
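/*
 * Flags describing the kinds of thread binding that the disp_bound_*()
 * routines later in this file check for.
 */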
67 #define BOUND_CPU 0x1
68 #define BOUND_PARTITION 0x2
69 #define BOUND_INTR 0x4
70
71 /* Dispatch queue allocation structure and functions */
72 struct disp_queue_info {
73 disp_t *dp;
74 dispq_t *olddispq;
75 dispq_t *newdispq;
76 ulong_t *olddqactmap;
77 ulong_t *newdqactmap;
78 int oldnglobpris;
79 };
80 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
81 disp_t *dp);
82 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
83 static void disp_dq_free(struct disp_queue_info *dptr);
84
85 /* platform-specific routine to call when processor is idle */
86 static void generic_idle_cpu();
87 void (*idle_cpu)() = generic_idle_cpu;
88
89 /* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter(void);
static void	idle_exit(void);
92
93 /* platform-specific routine to call when thread is enqueued */
94 static void generic_enq_thread(cpu_t *, int);
95 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
96
97 pri_t kpreemptpri; /* priority where kernel preemption applies */
98 pri_t upreemptpri = 0; /* priority where normal preemption applies */
99 pri_t intr_pri; /* interrupt thread priority base level */
100
101 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
102 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
103 disp_t cpu0_disp; /* boot CPU's dispatch queue */
104 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
105 int nswapped; /* total number of swapped threads */
106 void disp_swapped_enq(kthread_t *tp);
107 static void disp_swapped_setrun(kthread_t *tp);
108 static void cpu_resched(cpu_t *cp, pri_t tpri);
109
110 /*
111 * If this is set, only interrupt threads will cause kernel preemptions.
112 * This is done by changing the value of kpreemptpri. kpreemptpri
113 * will either be the max sysclass pri + 1 or the min interrupt pri.
114 */
115 int only_intr_kpreempt;
116
117 extern void set_idle_cpu(int cpun);
118 extern void unset_idle_cpu(int cpun);
119 static void setkpdq(kthread_t *tp, int borf);
120 #define SETKP_BACK 0
121 #define SETKP_FRONT 1
122 /*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU, to reduce
 * cold cache effects.  The interval is in clock ticks.
126 */
127 #define RECHOOSE_INTERVAL 3
128 int rechoose_interval = RECHOOSE_INTERVAL;
129
130 /*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, to
 * reduce migrations.
 *
 * nosteal_nsec should be set by the platform code in
 * cmp_set_nosteal_interval() to an appropriate value; it is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal
 * 'protection'.
 */
141 #define NOSTEAL_UNINITIALIZED (-1)
142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
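/*
 * For example (assuming the platform default is not wanted), the nosteal
 * protection can be disabled from /etc/system with a line like:
 *	set nosteal_nsec = 0
 */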
143 extern void cmp_set_nosteal_interval(void);
144
145 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
146
147 disp_lock_t transition_lock; /* lock on transitioning threads */
148 disp_lock_t stop_lock; /* lock on stopped threads */
149
150 static void cpu_dispqalloc(int numpris);
151
152 /*
153 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
154 * a thread because it was sitting on its run queue for a very short
155 * period of time.
156 */
157 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
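/*
 * T_DONTSTEAL is not a valid thread pointer: callers must compare against
 * it before dereferencing the result (see the checks in idle() and disp()).
 */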
158
159 static kthread_t *disp_getwork(cpu_t *to);
160 static kthread_t *disp_getbest(disp_t *from);
161 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
162
163 void swtch_to(kthread_t *);
164
165 /*
166 * dispatcher and scheduler initialization
167 */
168
169 /*
170 * disp_setup - Common code to calculate and allocate dispatcher
171 * variables and structures based on the maximum priority.
172 */
173 static void
174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
175 {
176 pri_t newnglobpris;
177
178 ASSERT(MUTEX_HELD(&cpu_lock));
179
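	/*
	 * Global priorities run from 0 to maxglobpri, with LOCK_LEVEL
	 * additional levels above that reserved for interrupt threads,
	 * which run at intr_pri plus their PIL.
	 */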
180 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
181
182 if (newnglobpris > oldnglobpris) {
183 /*
184 * Allocate new kp queues for each CPU partition.
185 */
186 cpupart_kpqalloc(newnglobpris);
187
188 /*
189 * Allocate new dispatch queues for each CPU.
190 */
191 cpu_dispqalloc(newnglobpris);
192
193 /*
194 * compute new interrupt thread base priority
195 */
196 intr_pri = maxglobpri;
197 if (only_intr_kpreempt) {
198 kpreemptpri = intr_pri + 1;
199 if (kpqpri == KPQPRI)
200 kpqpri = kpreemptpri;
201 }
202 v.v_nglobpris = newnglobpris;
203 }
204 }
205
206 /*
207 * dispinit - Called to initialize all loaded classes and the
208 * dispatcher framework.
209 */
210 void
211 dispinit(void)
212 {
213 id_t cid;
214 pri_t maxglobpri;
215 pri_t cl_maxglobpri;
216
217 maxglobpri = -1;
218
219 /*
	 * Initialize the transition lock, which will always be held
	 * (it is entered here and never exited).
221 */
222 DISP_LOCK_INIT(&transition_lock);
223 disp_lock_enter_high(&transition_lock);
224 DISP_LOCK_INIT(&stop_lock);
225
226 mutex_enter(&cpu_lock);
227 CPU->cpu_disp->disp_maxrunpri = -1;
228 CPU->cpu_disp->disp_max_unbound_pri = -1;
229
230 /*
231 * Initialize the default CPU partition.
232 */
233 cpupart_initialize_default();
234 /*
235 * Call the class specific initialization functions for
236 * all pre-installed schedulers.
237 *
238 * We pass the size of a class specific parameter
239 * buffer to each of the initialization functions
240 * to try to catch problems with backward compatibility
241 * of class modules.
242 *
	 * For example, a new class module running on an old system
244 * which didn't provide sufficiently large parameter buffers
245 * would be bad news. Class initialization modules can check for
246 * this and take action if they detect a problem.
247 */
248
249 for (cid = 0; cid < nclass; cid++) {
250 sclass_t *sc;
251
252 sc = &sclass[cid];
253 if (SCHED_INSTALLED(sc)) {
254 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
255 &sc->cl_funcs);
256 if (cl_maxglobpri > maxglobpri)
257 maxglobpri = cl_maxglobpri;
258 }
259 }
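	/*
	 * Kernel preemption applies to anything above the maximum
	 * system-class priority.
	 */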
260 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
261 if (kpqpri == KPQPRI)
262 kpqpri = kpreemptpri;
263
264 ASSERT(maxglobpri >= 0);
265 disp_setup(maxglobpri, 0);
266
267 mutex_exit(&cpu_lock);
268
269 /*
270 * Platform specific sticky scheduler setup.
271 */
272 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
273 cmp_set_nosteal_interval();
274
275 /*
	 * Get the default class ID; this may later be modified via
277 * dispadmin(1M). This will load the class (normally TS) and that will
278 * call disp_add(), which is why we had to drop cpu_lock first.
279 */
280 if (getcid(defaultclass, &defaultcid) != 0) {
281 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
282 defaultclass);
283 }
284 }
285
286 /*
287 * disp_add - Called with class pointer to initialize the dispatcher
288 * for a newly loaded class.
289 */
290 void
291 disp_add(sclass_t *clp)
292 {
293 pri_t maxglobpri;
294 pri_t cl_maxglobpri;
295
296 mutex_enter(&cpu_lock);
297 /*
298 * Initialize the scheduler class.
299 */
300 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
301 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
302 if (cl_maxglobpri > maxglobpri)
303 maxglobpri = cl_maxglobpri;
304
305 /*
	 * Save the old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, the size of
	 * the dispq may have changed.  We need to handle that here.
310 */
311 disp_setup(maxglobpri, v.v_nglobpris);
312
313 mutex_exit(&cpu_lock);
314 }
315
316
317 /*
318 * For each CPU, allocate new dispatch queues
319 * with the stated number of priorities.
320 */
321 static void
322 cpu_dispqalloc(int numpris)
323 {
324 cpu_t *cpup;
325 struct disp_queue_info *disp_mem;
326 int i, num;
327
328 ASSERT(MUTEX_HELD(&cpu_lock));
329
330 disp_mem = kmem_zalloc(NCPU *
331 sizeof (struct disp_queue_info), KM_SLEEP);
332
333 /*
	 * This routine must allocate all of the memory before stopping
	 * the CPUs because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks held by the paused CPUs will not be
	 * released until they are restarted.
338 */
339 i = 0;
340 cpup = cpu_list;
341 do {
342 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
343 i++;
344 cpup = cpup->cpu_next;
345 } while (cpup != cpu_list);
346 num = i;
347
348 pause_cpus(NULL, NULL);
349 for (i = 0; i < num; i++)
350 disp_dq_assign(&disp_mem[i], numpris);
351 start_cpus();
352
353 /*
	 * All of the memory must be freed after starting the CPUs because
	 * we cannot risk sleeping in kmem_free while the CPUs are stopped.
356 */
357 for (i = 0; i < num; i++)
358 disp_dq_free(&disp_mem[i]);
359
360 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
361 }
362
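/*
 * Pre-allocate a new dispatch queue and active-priority bitmap for dp.
 * The actual switch-over happens later, in disp_dq_assign(), possibly
 * while the CPUs are paused.
 */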
363 static void
364 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
365 {
366 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
367 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
368 sizeof (long), KM_SLEEP);
369 dptr->dp = dp;
370 }
371
372 static void
373 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
374 {
375 disp_t *dp;
376
377 dp = dptr->dp;
378 dptr->olddispq = dp->disp_q;
379 dptr->olddqactmap = dp->disp_qactmap;
380 dptr->oldnglobpris = dp->disp_npri;
381
382 ASSERT(dptr->oldnglobpris < numpris);
383
384 if (dptr->olddispq != NULL) {
385 /*
386 * Use kcopy because bcopy is platform-specific
		 * and could block while the CPUs might be paused.
388 */
389 (void) kcopy(dptr->olddispq, dptr->newdispq,
390 dptr->oldnglobpris * sizeof (dispq_t));
391 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
392 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
393 sizeof (long));
394 }
395 dp->disp_q = dptr->newdispq;
396 dp->disp_qactmap = dptr->newdqactmap;
397 dp->disp_q_limit = &dptr->newdispq[numpris];
398 dp->disp_npri = numpris;
399 }
400
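/*
 * Free the old queue and bitmap recorded by disp_dq_assign().  This must
 * not be done while the CPUs are paused, since kmem_free may sleep.
 */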
401 static void
402 disp_dq_free(struct disp_queue_info *dptr)
403 {
404 if (dptr->olddispq != NULL)
405 kmem_free(dptr->olddispq,
406 dptr->oldnglobpris * sizeof (dispq_t));
407 if (dptr->olddqactmap != NULL)
408 kmem_free(dptr->olddqactmap,
409 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
410 }
411
412 /*
413 * For a newly created CPU, initialize the dispatch queue.
414 * This is called before the CPU is known through cpu[] or on any lists.
415 */
416 void
417 disp_cpu_init(cpu_t *cp)
418 {
419 disp_t *dp;
420 dispq_t *newdispq;
421 ulong_t *newdqactmap;
422
423 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
424
425 if (cp == cpu0_disp.disp_cpu)
426 dp = &cpu0_disp;
427 else
428 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
429 bzero(dp, sizeof (disp_t));
430 cp->cpu_disp = dp;
431 dp->disp_cpu = cp;
432 dp->disp_maxrunpri = -1;
433 dp->disp_max_unbound_pri = -1;
434 DISP_LOCK_INIT(&cp->cpu_thread_lock);
435 /*
436 * Allocate memory for the dispatcher queue headers
437 * and the active queue bitmap.
438 */
439 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
440 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
441 sizeof (long), KM_SLEEP);
442 dp->disp_q = newdispq;
443 dp->disp_qactmap = newdqactmap;
444 dp->disp_q_limit = &newdispq[v.v_nglobpris];
445 dp->disp_npri = v.v_nglobpris;
446 }
447
448 void
449 disp_cpu_fini(cpu_t *cp)
450 {
451 ASSERT(MUTEX_HELD(&cpu_lock));
452
453 disp_kp_free(cp->cpu_disp);
454 if (cp->cpu_disp != &cpu0_disp)
455 kmem_free(cp->cpu_disp, sizeof (disp_t));
456 }
457
458 /*
 * Allocate a new, larger kpreempt dispatch queue to replace the old one.
460 */
461 void
462 disp_kp_alloc(disp_t *dq, pri_t npri)
463 {
464 struct disp_queue_info mem_info;
465
466 if (npri > dq->disp_npri) {
467 /*
468 * Allocate memory for the new array.
469 */
470 disp_dq_alloc(&mem_info, npri, dq);
471
472 /*
473 * We need to copy the old structures to the new
474 * and free the old.
475 */
476 disp_dq_assign(&mem_info, npri);
477 disp_dq_free(&mem_info);
478 }
479 }
480
481 /*
482 * Free dispatch queue.
483 * Used for the kpreempt queues for a removed CPU partition and
484 * for the per-CPU queues of deleted CPUs.
485 */
486 void
487 disp_kp_free(disp_t *dq)
488 {
489 struct disp_queue_info mem_info;
490
491 mem_info.olddispq = dq->disp_q;
492 mem_info.olddqactmap = dq->disp_qactmap;
493 mem_info.oldnglobpris = dq->disp_npri;
494 disp_dq_free(&mem_info);
495 }
496
497 /*
498 * End dispatcher and scheduler initialization.
499 */
500
501 /*
502 * See if there's anything to do other than remain idle.
503 * Return non-zero if there is.
504 *
505 * This function must be called with high spl, or with
506 * kernel preemption disabled to prevent the partition's
507 * active cpu list from changing while being traversed.
508 *
509 * This is essentially a simpler version of disp_getwork()
510 * to be called by CPUs preparing to "halt".
511 */
512 int
513 disp_anywork(void)
514 {
515 cpu_t *cp = CPU;
516 cpu_t *ocp;
517 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
518
519 if (!(cp->cpu_flags & CPU_OFFLINE)) {
520 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
521 return (1);
522
523 for (ocp = cp->cpu_next_part; ocp != cp;
524 ocp = ocp->cpu_next_part) {
525 ASSERT(CPU_ACTIVE(ocp));
526
527 /*
528 * Something has appeared on the local run queue.
529 */
530 if (*local_nrunnable > 0)
531 return (1);
532 /*
533 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork(),
			 * terminate our walk here and let this other CPU
536 * patrol the next part of the list.
537 */
538 if (ocp->cpu_dispatch_pri == -1 &&
539 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
540 return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
548 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
549 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
550 ocp->cpu_disp->disp_nrunnable == 1) &&
551 ocp->cpu_dispatch_pri != -1)
552 return (1);
553 }
554 }
555 return (0);
556 }
557
558 /*
559 * Called when CPU enters the idle loop
560 */
561 static void
idle_enter(void)
563 {
564 cpu_t *cp = CPU;
565
566 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
567 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
568 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
569 }
570
571 /*
572 * Called when CPU exits the idle loop
573 */
574 static void
idle_exit(void)
576 {
577 cpu_t *cp = CPU;
578
579 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
580 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
581 }
582
583 /*
584 * Idle loop.
585 */
586 void
idle(void)
588 {
589 struct cpu *cp = CPU; /* pointer to this CPU */
590 kthread_t *t; /* taken thread */
591
592 idle_enter();
593
594 /*
595 * Uniprocessor version of idle loop.
596 * Do this until notified that we're on an actual multiprocessor.
597 */
598 while (ncpus == 1) {
599 if (cp->cpu_disp->disp_nrunnable == 0) {
600 (*idle_cpu)();
601 continue;
602 }
603 idle_exit();
604 swtch();
605
606 idle_enter(); /* returned from swtch */
607 }
608
609 /*
610 * Multiprocessor idle loop.
611 */
612 for (;;) {
613 /*
		 * If the CPU is completely quiesced by p_online(2), just wait
615 * here with minimal bus traffic until put online.
616 */
617 while (cp->cpu_flags & CPU_QUIESCED)
618 (*idle_cpu)();
619
620 if (cp->cpu_disp->disp_nrunnable != 0) {
621 idle_exit();
622 swtch();
623 } else {
624 if (cp->cpu_flags & CPU_OFFLINE)
625 continue;
626 if ((t = disp_getwork(cp)) == NULL) {
627 if (cp->cpu_chosen_level != -1) {
628 disp_t *dp = cp->cpu_disp;
629 disp_t *kpq;
630
631 disp_lock_enter(&dp->disp_lock);
632 /*
633 * Set kpq under lock to prevent
634 * migration between partitions.
635 */
636 kpq = &cp->cpu_part->cp_kp_queue;
637 if (kpq->disp_maxrunpri == -1)
638 cp->cpu_chosen_level = -1;
639 disp_lock_exit(&dp->disp_lock);
640 }
641 (*idle_cpu)();
642 continue;
643 }
644 /*
645 * If there was a thread but we couldn't steal
646 * it, then keep trying.
647 */
648 if (t == T_DONTSTEAL)
649 continue;
650 idle_exit();
651 swtch_to(t);
652 }
653 idle_enter(); /* returned from swtch/swtch_to */
654 }
655 }
656
657
658 /*
659 * Preempt the currently running thread in favor of the highest
660 * priority thread. The class of the current thread controls
661 * where it goes on the dispatcher queues. If panicking, turn
662 * preemption off.
663 */
664 void
preempt(void)
666 {
667 kthread_t *t = curthread;
668 klwp_t *lwp = ttolwp(curthread);
669
670 if (panicstr)
671 return;
672
673 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
674
675 thread_lock(t);
676
677 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
678 /*
		 * This thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
681 * already headed for swtch().
682 */
683 CPU->cpu_kprunrun = 0;
684 thread_unlock_nopreempt(t);
685 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
686 } else {
687 if (lwp != NULL)
688 lwp->lwp_ru.nivcsw++;
689 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
690 THREAD_TRANSITION(t);
691 CL_PREEMPT(t);
692 DTRACE_SCHED(preempt);
693 thread_unlock_nopreempt(t);
694
695 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
696
697 swtch(); /* clears CPU->cpu_runrun via disp() */
698 }
699 }
700
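/*
 * thread_unpin() unpins the thread that an interrupt thread is currently
 * pinning and returns it, so that swtch() can resume it.
 */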
701 extern kthread_t *thread_unpin();
702
703 /*
704 * disp() - find the highest priority thread for this processor to run, and
705 * set it in TS_ONPROC state so that resume() can be called to run it.
706 */
707 static kthread_t *
disp(void)
709 {
710 cpu_t *cpup;
711 disp_t *dp;
712 kthread_t *tp;
713 dispq_t *dq;
714 int maxrunword;
715 pri_t pri;
716 disp_t *kpq;
717
718 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
719
720 cpup = CPU;
721 /*
722 * Find the highest priority loaded, runnable thread.
723 */
724 dp = cpup->cpu_disp;
725
726 reschedule:
727 /*
728 * If there is more important work on the global queue with a better
729 * priority than the maximum on this CPU, take it now.
730 */
731 kpq = &cpup->cpu_part->cp_kp_queue;
732 while ((pri = kpq->disp_maxrunpri) >= 0 &&
733 pri >= dp->disp_maxrunpri &&
734 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
735 (tp = disp_getbest(kpq)) != NULL) {
736 if (disp_ratify(tp, kpq) != NULL) {
737 TRACE_1(TR_FAC_DISP, TR_DISP_END,
738 "disp_end:tid %p", tp);
739 return (tp);
740 }
741 }
742
743 disp_lock_enter(&dp->disp_lock);
744 pri = dp->disp_maxrunpri;
745
746 /*
747 * If there is nothing to run, look at what's runnable on other queues.
748 * Choose the idle thread if the CPU is quiesced.
749 * Note that CPUs that have the CPU_OFFLINE flag set can still run
750 * interrupt threads, which will be the only threads on the CPU's own
751 * queue, but cannot run threads from other queues.
752 */
753 if (pri == -1) {
754 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
755 disp_lock_exit(&dp->disp_lock);
756 if ((tp = disp_getwork(cpup)) == NULL ||
757 tp == T_DONTSTEAL) {
758 tp = cpup->cpu_idle_thread;
759 (void) splhigh();
760 THREAD_ONPROC(tp, cpup);
761 cpup->cpu_dispthread = tp;
762 cpup->cpu_dispatch_pri = -1;
763 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
764 cpup->cpu_chosen_level = -1;
765 }
766 } else {
767 disp_lock_exit_high(&dp->disp_lock);
768 tp = cpup->cpu_idle_thread;
769 THREAD_ONPROC(tp, cpup);
770 cpup->cpu_dispthread = tp;
771 cpup->cpu_dispatch_pri = -1;
772 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
773 cpup->cpu_chosen_level = -1;
774 }
775 TRACE_1(TR_FAC_DISP, TR_DISP_END,
776 "disp_end:tid %p", tp);
777 return (tp);
778 }
779
780 dq = &dp->disp_q[pri];
781 tp = dq->dq_first;
782
783 ASSERT(tp != NULL);
784 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
785
786 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
787
788 /*
789 * Found it so remove it from queue.
790 */
791 dp->disp_nrunnable--;
792 dq->dq_sruncnt--;
793 if ((dq->dq_first = tp->t_link) == NULL) {
794 ulong_t *dqactmap = dp->disp_qactmap;
795
796 ASSERT(dq->dq_sruncnt == 0);
797 dq->dq_last = NULL;
798
799 /*
800 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest queue, so
		 * recompute disp_maxrunpri.
804 */
805 maxrunword = pri >> BT_ULSHIFT;
806 dqactmap[maxrunword] &= ~BT_BIW(pri);
807
808 if (dp->disp_nrunnable == 0) {
809 dp->disp_max_unbound_pri = -1;
810 dp->disp_maxrunpri = -1;
811 } else {
812 int ipri;
813
814 ipri = bt_gethighbit(dqactmap, maxrunword);
815 dp->disp_maxrunpri = ipri;
816 if (ipri < dp->disp_max_unbound_pri)
817 dp->disp_max_unbound_pri = ipri;
818 }
819 } else {
820 tp->t_link = NULL;
821 }
822
823 /*
824 * Set TS_DONT_SWAP flag to prevent another processor from swapping
825 * out this thread before we have a chance to run it.
826 * While running, it is protected against swapping by t_lock.
827 */
828 tp->t_schedflag |= TS_DONT_SWAP;
829 cpup->cpu_dispthread = tp; /* protected by spl only */
830 cpup->cpu_dispatch_pri = pri;
831 ASSERT(pri == DISP_PRIO(tp));
832 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
833 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
834
835 ASSERT(tp != NULL);
836 TRACE_1(TR_FAC_DISP, TR_DISP_END,
837 "disp_end:tid %p", tp);
838
839 if (disp_ratify(tp, kpq) == NULL)
840 goto reschedule;
841
842 return (tp);
843 }
844
845 /*
846 * swtch()
847 * Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state
 *	(on a sleep queue, on a run queue, or stopped) and not zombied.
850 * May be called at any spl level less than or equal to LOCK_LEVEL.
851 * Always drops spl to the base level (spl0()).
852 */
853 void
swtch(void)
855 {
856 kthread_t *t = curthread;
857 kthread_t *next;
858 cpu_t *cp;
859
860 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
861
862 if (t->t_flag & T_INTR_THREAD)
863 cpu_intr_swtch_enter(t);
864
865 if (t->t_intr != NULL) {
866 /*
		 * We are an interrupt thread.  Set up and return
868 * the interrupted thread to be resumed.
869 */
870 (void) splhigh(); /* block other scheduler action */
871 cp = CPU; /* now protected against migration */
872 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
873 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
874 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
875 next = thread_unpin();
876 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
877 resume_from_intr(next);
878 } else {
879 #ifdef DEBUG
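		/*
		 * Sanity check: a thread that is still TS_ONPROC on this CPU
		 * when it reaches swtch() must have preemption disabled.  The
		 * unlocked observation is re-checked under thread_lock before
		 * asserting.
		 */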
880 if (t->t_state == TS_ONPROC &&
881 t->t_disp_queue->disp_cpu == CPU &&
882 t->t_preempt == 0) {
883 thread_lock(t);
884 ASSERT(t->t_state != TS_ONPROC ||
885 t->t_disp_queue->disp_cpu != CPU ||
886 t->t_preempt != 0); /* cannot migrate */
887 thread_unlock_nopreempt(t);
888 }
889 #endif /* DEBUG */
890 cp = CPU;
891 next = disp(); /* returns with spl high */
892 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
893
894 /* OK to steal anything left on run queue */
895 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
896
897 if (next != t) {
898 hrtime_t now;
899
900 now = gethrtime_unscaled();
901 pg_ev_thread_swtch(cp, now, t, next);
902
903 /*
904 * If t was previously in the TS_ONPROC state,
905 * setfrontdq and setbackdq won't have set its t_waitrq.
906 * Since we now finally know that we're switching away
907 * from this thread, set its t_waitrq if it is on a run
908 * queue.
909 */
910 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
911 t->t_waitrq = now;
912 }
913
914 /*
915 * restore mstate of thread that we are switching to
916 */
917 restore_mstate(next);
918
919 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
920 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
921 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
922
923 if (dtrace_vtime_active)
924 dtrace_vtime_switch(next);
925
926 resume(next);
927 /*
928 * The TR_RESUME_END and TR_SWTCH_END trace points
929 * appear at the end of resume(), because we may not
930 * return here
931 */
932 } else {
933 if (t->t_flag & T_INTR_THREAD)
934 cpu_intr_swtch_exit(t);
935 /*
936 * Threads that enqueue themselves on a run queue defer
937 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if
			 * it remains on the CPU.
			 * There is, however, a window between where the thread
			 * placed itself on a run queue and where it selects
			 * itself in disp(), in which a third party (e.g.
			 * clock() doing tick processing) may have re-enqueued
			 * this thread, setting t_waitrq in the process.  We
			 * detect this race by noticing that, despite switching
			 * to ourself, our t_waitrq has been set, and should be
947 * cleared.
948 */
949 if (t->t_waitrq != 0)
950 t->t_waitrq = 0;
951
952 pg_ev_thread_remain(cp, t);
953
954 DTRACE_SCHED(remain__cpu);
955 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
956 (void) spl0();
957 }
958 }
959 }
960
961 /*
962 * swtch_from_zombie()
963 * Special case of swtch(), which allows checks for TS_ZOMB to be
964 * eliminated from normal resume.
965 * Find best runnable thread and run it.
966 * Called with the current thread zombied.
967 * Zombies cannot migrate, so CPU references are safe.
968 */
969 void
swtch_from_zombie(void)
971 {
972 kthread_t *next;
973 cpu_t *cpu = CPU;
974
975 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
976
977 ASSERT(curthread->t_state == TS_ZOMB);
978
979 next = disp(); /* returns with spl high */
980 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
981 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
982 ASSERT(next != curthread);
983 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
984
985 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
986
987 restore_mstate(next);
988
989 if (dtrace_vtime_active)
990 dtrace_vtime_switch(next);
991
992 resume_from_zombie(next);
993 /*
994 * The TR_RESUME_END and TR_SWTCH_END trace points
995 * appear at the end of resume(), because we certainly will not
996 * return here
997 */
998 }
999
1000 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
1001
1002 /*
1003 * search_disp_queues()
1004 * Search the given dispatch queues for thread tp.
1005 * Return 1 if tp is found, otherwise return 0.
1006 */
1007 static int
1008 search_disp_queues(disp_t *dp, kthread_t *tp)
1009 {
1010 dispq_t *dq;
1011 dispq_t *eq;
1012
1013 disp_lock_enter_high(&dp->disp_lock);
1014
1015 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1016 kthread_t *rp;
1017
1018 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1019
1020 for (rp = dq->dq_first; rp; rp = rp->t_link)
1021 if (tp == rp) {
1022 disp_lock_exit_high(&dp->disp_lock);
1023 return (1);
1024 }
1025 }
1026 disp_lock_exit_high(&dp->disp_lock);
1027
1028 return (0);
1029 }
1030
1031 /*
1032 * thread_on_queue()
1033 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1034 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1035 */
1036 static int
1037 thread_on_queue(kthread_t *tp)
1038 {
1039 cpu_t *cp;
1040 struct cpupart *part;
1041
1042 ASSERT(getpil() >= DISP_LEVEL);
1043
1044 /*
1045 * Search the per-CPU dispatch queues for tp.
1046 */
1047 cp = CPU;
1048 do {
1049 if (search_disp_queues(cp->cpu_disp, tp))
1050 return (1);
1051 } while ((cp = cp->cpu_next_onln) != CPU);
1052
1053 /*
1054 * Search the partition-wide kpreempt queues for tp.
1055 */
1056 part = CPU->cpu_part;
1057 do {
1058 if (search_disp_queues(&part->cp_kp_queue, tp))
1059 return (1);
1060 } while ((part = part->cp_next) != CPU->cpu_part);
1061
1062 return (0);
1063 }
1064
1065 #else
1066
/* In non-DEBUG kernels this is constant 0; use only as ASSERT(!thread_on_queue(tp)). */
#define	thread_on_queue(tp)	0
1068
1069 #endif /* DEBUG */
1070
1071 /*
 * Like swtch(), but switch to a specified thread taken from another CPU.
 * Called with spl high.
1074 */
1075 void
1076 swtch_to(kthread_t *next)
1077 {
1078 cpu_t *cp = CPU;
1079 hrtime_t now;
1080
1081 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1082
1083 /*
1084 * Update context switch statistics.
1085 */
1086 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1087
1088 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1089
1090 now = gethrtime_unscaled();
1091 pg_ev_thread_swtch(cp, now, curthread, next);
1092
1093 /* OK to steal anything left on run queue */
1094 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1095
1096 /* record last execution time */
1097 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1098
1099 /*
	 * If curthread was previously in the TS_ONPROC state, setfrontdq and
	 * setbackdq won't have set its t_waitrq.  Since we now finally know
	 * that we're switching away from this thread, set its t_waitrq if it
	 * is on a run queue.
1104 */
1105 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1106 curthread->t_waitrq = now;
1107 }
1108
1109 /* restore next thread to previously running microstate */
1110 restore_mstate(next);
1111
1112 if (dtrace_vtime_active)
1113 dtrace_vtime_switch(next);
1114
1115 resume(next);
1116 /*
1117 * The TR_RESUME_END and TR_SWTCH_END trace points
1118 * appear at the end of resume(), because we may not
1119 * return here
1120 */
1121 }
1122
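/*
 * Arrange for cp to be preempted in favor of a thread of priority tpri.
 * If tpri beats what cp is currently running, set cpu_runrun for a
 * user-level preemption (tpri >= upreemptpri) or cpu_kprunrun for a
 * kernel preemption (tpri >= kpreemptpri), and poke cp if it is not the
 * current CPU so that it notices promptly.
 */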
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126 int call_poke_cpu = 0;
1127 pri_t cpupri = cp->cpu_dispatch_pri;
1128
1129 if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1130 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133 cp->cpu_runrun = 1;
1134 aston(cp->cpu_dispthread);
1135 if (tpri < kpreemptpri && cp != CPU)
1136 call_poke_cpu = 1;
1137 }
1138 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139 cp->cpu_kprunrun = 1;
1140 if (cp != CPU)
1141 call_poke_cpu = 1;
1142 }
1143 }
1144
1145 /*
1146 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1147 */
1148 membar_enter();
1149
1150 if (call_poke_cpu)
1151 poke_cpu(cp->cpu_id);
1152 }
1153
1154 /*
1155 * setbackdq() keeps runqs balanced such that the difference in length
1156 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread's priority.
1160 */
1161 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1162 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1163 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
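/*
 * For example: with RUNQ_MAX_DIFF at 2, a thread at or above
 * RUNQ_MATCH_PRI (without TS_RUNQMATCH) whose chosen CPU already has 5
 * threads queued at its priority will only move to a neighbor with fewer
 * than 3 queued there; below RUNQ_MATCH_PRI, any strictly shorter queue
 * will do.
 */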
1164
1165 /*
1166 * Macro that evaluates to true if it is likely that the thread has cache
1167 * warmth. This is based on the amount of time that has elapsed since the
1168 * thread last ran. If that amount of time is less than "rechoose_interval"
1169 * ticks, then we decide that the thread has enough cache warmth to warrant
1170 * some affinity for t->t_cpu.
1171 */
1172 #define THREAD_HAS_CACHE_WARMTH(thread) \
1173 ((thread == curthread) || \
1174 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1175 /*
1176 * Put the specified thread on the back of the dispatcher
1177 * queue corresponding to its current priority.
1178 *
1179 * Called with the thread in transition, onproc or stopped state
1180 * and locked (transition implies locked) and at high spl.
1181 * Returns with the thread in TS_RUN state and still locked.
1182 */
1183 void
1184 setbackdq(kthread_t *tp)
1185 {
1186 dispq_t *dq;
1187 disp_t *dp;
1188 cpu_t *cp;
1189 pri_t tpri;
1190 int bound;
1191 boolean_t self;
1192
1193 ASSERT(THREAD_LOCK_HELD(tp));
1194 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1195 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1196
1197 /*
	 * If the thread is "swapped" or on the swap queue, don't
1199 * queue it, but wake sched.
1200 */
1201 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1202 disp_swapped_setrun(tp);
1203 return;
1204 }
1205
1206 self = (tp == curthread);
1207
1208 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209 bound = 1;
1210 else
1211 bound = 0;
1212
1213 tpri = DISP_PRIO(tp);
1214 if (ncpus == 1)
1215 cp = tp->t_cpu;
1216 else if (!bound) {
1217 if (tpri >= kpqpri) {
1218 setkpdq(tp, SETKP_BACK);
1219 return;
1220 }
1221
1222 /*
1223 * We'll generally let this thread continue to run where
1224 * it last ran...but will consider migration if:
1225 * - The thread probably doesn't have much cache warmth.
		 * - HT exclusion would prefer us to run elsewhere.
1227 * - The CPU where it last ran is the target of an offline
1228 * request.
1229 * - The thread last ran outside its home lgroup.
1230 */
1231 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1232 !ht_should_run(tp, tp->t_cpu) ||
1233 (tp->t_cpu == cpu_inmotion) ||
1234 !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1235 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1236 } else {
1237 cp = tp->t_cpu;
1238 }
1239
1240 if (tp->t_cpupart == cp->cpu_part) {
1241 int qlen;
1242
1243 /*
1244 * Perform any CMT load balancing
1245 */
1246 cp = cmt_balance(tp, cp);
1247
1248 /*
1249 * Balance across the run queues
1250 */
1251 qlen = RUNQ_LEN(cp, tpri);
1252 if (tpri >= RUNQ_MATCH_PRI &&
1253 !(tp->t_schedflag & TS_RUNQMATCH))
1254 qlen -= RUNQ_MAX_DIFF;
1255 if (qlen > 0) {
1256 cpu_t *newcp;
1257
1258 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259 newcp = cp->cpu_next_part;
1260 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1261 newcp = cp->cpu_next_part;
1262 }
1263
1264 if (ht_should_run(tp, newcp) &&
1265 RUNQ_LEN(newcp, tpri) < qlen) {
1266 DTRACE_PROBE3(runq__balance,
1267 kthread_t *, tp,
1268 cpu_t *, cp, cpu_t *, newcp);
1269 cp = newcp;
1270 }
1271 }
1272 } else {
1273 /*
1274 * Migrate to a cpu in the new partition.
1275 */
1276 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1277 tp->t_pri);
1278 }
1279 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1280 } else {
1281 /*
1282 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1283 * a short time until weak binding that existed when the
1284 * strong binding was established has dropped) so we must
1285 * favour weak binding over strong.
1286 */
1287 cp = tp->t_weakbound_cpu ?
1288 tp->t_weakbound_cpu : tp->t_bound_cpu;
1289 }
1290 /*
1291 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp().  If the thread we're placing
	 * on the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement thread is actually scheduled in swtch().  In this
1295 * situation, curthread is the only thread that could be in the ONPROC
1296 * state.
1297 */
1298 if ((!self) && (tp->t_waitrq == 0)) {
1299 hrtime_t curtime;
1300
1301 curtime = gethrtime_unscaled();
1302 (void) cpu_update_pct(tp, curtime);
1303 tp->t_waitrq = curtime;
1304 } else {
1305 (void) cpu_update_pct(tp, gethrtime_unscaled());
1306 }
1307
1308 dp = cp->cpu_disp;
1309 disp_lock_enter_high(&dp->disp_lock);
1310
1311 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1312 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1313 tpri, cp, tp);
1314
1315 #ifndef NPROBE
1316 /* Kernel probe */
1317 if (tnf_tracing_active)
1318 tnf_thread_queue(tp, cp, tpri);
1319 #endif /* NPROBE */
1320
1321 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1322
1323 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1324 tp->t_disp_queue = dp;
1325 tp->t_link = NULL;
1326
1327 dq = &dp->disp_q[tpri];
1328 dp->disp_nrunnable++;
1329 if (!bound)
1330 dp->disp_steal = 0;
1331 membar_enter();
1332
1333 if (dq->dq_sruncnt++ != 0) {
1334 ASSERT(dq->dq_first != NULL);
1335 dq->dq_last->t_link = tp;
1336 dq->dq_last = tp;
1337 } else {
1338 ASSERT(dq->dq_first == NULL);
1339 ASSERT(dq->dq_last == NULL);
1340 dq->dq_first = dq->dq_last = tp;
1341 BT_SET(dp->disp_qactmap, tpri);
1342 if (tpri > dp->disp_maxrunpri) {
1343 dp->disp_maxrunpri = tpri;
1344 membar_enter();
1345 cpu_resched(cp, tpri);
1346 }
1347 }
1348
1349 if (!bound && tpri > dp->disp_max_unbound_pri) {
1350 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1351 /*
1352 * If there are no other unbound threads on the
1353 * run queue, don't allow other CPUs to steal
1354 * this thread while we are in the middle of a
1355 * context switch. We may just switch to it
1356 * again right away. CPU_DISP_DONTSTEAL is cleared
1357 * in swtch and swtch_to.
1358 */
1359 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1360 }
1361 dp->disp_max_unbound_pri = tpri;
1362 }
1363 (*disp_enq_thread)(cp, bound);
1364 }
1365
1366 /*
1367 * Put the specified thread on the front of the dispatcher
1368 * queue corresponding to its current priority.
1369 *
1370 * Called with the thread in transition, onproc or stopped state
1371 * and locked (transition implies locked) and at high spl.
1372 * Returns with the thread in TS_RUN state and still locked.
1373 */
1374 void
1375 setfrontdq(kthread_t *tp)
1376 {
1377 disp_t *dp;
1378 dispq_t *dq;
1379 cpu_t *cp;
1380 pri_t tpri;
1381 int bound;
1382
1383 ASSERT(THREAD_LOCK_HELD(tp));
1384 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1385 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1386
1387 /*
	 * If the thread is "swapped" or on the swap queue, don't
1389 * queue it, but wake sched.
1390 */
1391 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1392 disp_swapped_setrun(tp);
1393 return;
1394 }
1395
1396 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1397 bound = 1;
1398 else
1399 bound = 0;
1400
1401 tpri = DISP_PRIO(tp);
1402 if (ncpus == 1)
1403 cp = tp->t_cpu;
1404 else if (!bound) {
1405 if (tpri >= kpqpri) {
1406 setkpdq(tp, SETKP_FRONT);
1407 return;
1408 }
1409 cp = tp->t_cpu;
1410 if (tp->t_cpupart == cp->cpu_part) {
1411 /*
1412 * We'll generally let this thread continue to run
1413 * where it last ran, but will consider migration if:
1414 * - The thread last ran outside its home lgroup.
1415 * - The CPU where it last ran is the target of an
1416 * offline request (a thread_nomigrate() on the in
1417 * motion CPU relies on this when forcing a preempt).
1418 * - The thread isn't the highest priority thread where
1419 * it last ran, and it is considered not likely to
1420 * have significant cache warmth.
1421 */
1422 if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1423 cp == cpu_inmotion ||
1424 (tpri < cp->cpu_disp->disp_maxrunpri &&
1425 !THREAD_HAS_CACHE_WARMTH(tp))) {
1426 cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1427 }
1428 } else {
1429 /*
1430 * Migrate to a cpu in the new partition.
1431 */
1432 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1433 tp, tp->t_pri);
1434 }
1435 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1436 } else {
1437 /*
1438 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1439 * a short time until weak binding that existed when the
1440 * strong binding was established has dropped) so we must
1441 * favour weak binding over strong.
1442 */
1443 cp = tp->t_weakbound_cpu ?
1444 tp->t_weakbound_cpu : tp->t_bound_cpu;
1445 }
1446
1447 /*
1448 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp().  If the thread we're placing
	 * on the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement thread is actually scheduled in swtch().  In this
1452 * situation, curthread is the only thread that could be in the ONPROC
1453 * state.
1454 */
1455 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1456 hrtime_t curtime;
1457
1458 curtime = gethrtime_unscaled();
1459 (void) cpu_update_pct(tp, curtime);
1460 tp->t_waitrq = curtime;
1461 } else {
1462 (void) cpu_update_pct(tp, gethrtime_unscaled());
1463 }
1464
1465 dp = cp->cpu_disp;
1466 disp_lock_enter_high(&dp->disp_lock);
1467
1468 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1469 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1470
1471 #ifndef NPROBE
1472 /* Kernel probe */
1473 if (tnf_tracing_active)
1474 tnf_thread_queue(tp, cp, tpri);
1475 #endif /* NPROBE */
1476
1477 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1478
1479 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1480 tp->t_disp_queue = dp;
1481
1482 dq = &dp->disp_q[tpri];
1483 dp->disp_nrunnable++;
1484 if (!bound)
1485 dp->disp_steal = 0;
1486 membar_enter();
1487
1488 if (dq->dq_sruncnt++ != 0) {
1489 ASSERT(dq->dq_last != NULL);
1490 tp->t_link = dq->dq_first;
1491 dq->dq_first = tp;
1492 } else {
1493 ASSERT(dq->dq_last == NULL);
1494 ASSERT(dq->dq_first == NULL);
1495 tp->t_link = NULL;
1496 dq->dq_first = dq->dq_last = tp;
1497 BT_SET(dp->disp_qactmap, tpri);
1498 if (tpri > dp->disp_maxrunpri) {
1499 dp->disp_maxrunpri = tpri;
1500 membar_enter();
1501 cpu_resched(cp, tpri);
1502 }
1503 }
1504
1505 if (!bound && tpri > dp->disp_max_unbound_pri) {
1506 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1507 cp == CPU) {
1508 /*
1509 * If there are no other unbound threads on the
1510 * run queue, don't allow other CPUs to steal
1511 * this thread while we are in the middle of a
1512 * context switch. We may just switch to it
1513 * again right away. CPU_DISP_DONTSTEAL is cleared
1514 * in swtch and swtch_to.
1515 */
1516 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1517 }
1518 dp->disp_max_unbound_pri = tpri;
1519 }
1520 (*disp_enq_thread)(cp, bound);
1521 }
1522
1523 /*
1524 * Put a high-priority unbound thread on the kp queue
1525 */
1526 static void
1527 setkpdq(kthread_t *tp, int borf)
1528 {
1529 dispq_t *dq;
1530 disp_t *dp;
1531 cpu_t *cp;
1532 pri_t tpri;
1533
1534 tpri = DISP_PRIO(tp);
1535
1536 dp = &tp->t_cpupart->cp_kp_queue;
1537 disp_lock_enter_high(&dp->disp_lock);
1538
1539 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1540
1541 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1542 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1543 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1544 tp->t_disp_queue = dp;
1545 dp->disp_nrunnable++;
1546 dq = &dp->disp_q[tpri];
1547
1548 if (dq->dq_sruncnt++ != 0) {
1549 if (borf == SETKP_BACK) {
1550 ASSERT(dq->dq_first != NULL);
1551 tp->t_link = NULL;
1552 dq->dq_last->t_link = tp;
1553 dq->dq_last = tp;
1554 } else {
1555 ASSERT(dq->dq_last != NULL);
1556 tp->t_link = dq->dq_first;
1557 dq->dq_first = tp;
1558 }
1559 } else {
1560 if (borf == SETKP_BACK) {
1561 ASSERT(dq->dq_first == NULL);
1562 ASSERT(dq->dq_last == NULL);
1563 dq->dq_first = dq->dq_last = tp;
1564 } else {
1565 ASSERT(dq->dq_last == NULL);
1566 ASSERT(dq->dq_first == NULL);
1567 tp->t_link = NULL;
1568 dq->dq_first = dq->dq_last = tp;
1569 }
1570 BT_SET(dp->disp_qactmap, tpri);
1571 if (tpri > dp->disp_max_unbound_pri)
1572 dp->disp_max_unbound_pri = tpri;
1573 if (tpri > dp->disp_maxrunpri) {
1574 dp->disp_maxrunpri = tpri;
1575 membar_enter();
1576 }
1577 }
1578
1579 cp = tp->t_cpu;
1580 if (tp->t_cpupart != cp->cpu_part) {
1581 /* migrate to a cpu in the new partition */
1582 cp = tp->t_cpupart->cp_cpulist;
1583 }
1584 cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1585 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1586 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1587
1588 #ifndef NPROBE
1589 /* Kernel probe */
1590 if (tnf_tracing_active)
1591 tnf_thread_queue(tp, cp, tpri);
1592 #endif /* NPROBE */
1593
1594 if (cp->cpu_chosen_level < tpri)
1595 cp->cpu_chosen_level = tpri;
1596 cpu_resched(cp, tpri);
1597 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1598 (*disp_enq_thread)(cp, 0);
1599 }
1600
1601 /*
1602 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found, but we return whether
1604 * or not it was found in case the caller wants to check.
1605 */
1606 int
1607 dispdeq(kthread_t *tp)
1608 {
1609 disp_t *dp;
1610 dispq_t *dq;
1611 kthread_t *rp;
1612 kthread_t *trp;
1613 kthread_t **ptp;
1614 int tpri;
1615
1616 ASSERT(THREAD_LOCK_HELD(tp));
1617
1618 if (tp->t_state != TS_RUN)
1619 return (0);
1620
1621 /*
1622 * The thread is "swapped" or is on the swap queue and
1623 * hence no longer on the run queue, so return true.
1624 */
1625 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1626 return (1);
1627
1628 tpri = DISP_PRIO(tp);
1629 dp = tp->t_disp_queue;
1630 ASSERT(tpri < dp->disp_npri);
1631 dq = &dp->disp_q[tpri];
1632 ptp = &dq->dq_first;
1633 rp = *ptp;
1634 trp = NULL;
1635
1636 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1637
1638 /*
1639 * Search for thread in queue.
1640 * Double links would simplify this at the expense of disp/setrun.
1641 */
1642 while (rp != tp && rp != NULL) {
1643 trp = rp;
1644 ptp = &trp->t_link;
1645 rp = trp->t_link;
1646 }
1647
1648 if (rp == NULL) {
1649 panic("dispdeq: thread not on queue");
1650 }
1651
1652 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1653
1654 /*
1655 * Found it so remove it from queue.
1656 */
1657 if ((*ptp = rp->t_link) == NULL)
1658 dq->dq_last = trp;
1659
1660 dp->disp_nrunnable--;
1661 if (--dq->dq_sruncnt == 0) {
1662 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1663 if (dp->disp_nrunnable == 0) {
1664 dp->disp_max_unbound_pri = -1;
1665 dp->disp_maxrunpri = -1;
1666 } else if (tpri == dp->disp_maxrunpri) {
1667 int ipri;
1668
1669 ipri = bt_gethighbit(dp->disp_qactmap,
1670 dp->disp_maxrunpri >> BT_ULSHIFT);
1671 if (ipri < dp->disp_max_unbound_pri)
1672 dp->disp_max_unbound_pri = ipri;
1673 dp->disp_maxrunpri = ipri;
1674 }
1675 }
1676 tp->t_link = NULL;
1677 THREAD_TRANSITION(tp); /* put in intermediate state */
1678 return (1);
1679 }
1680
1681
1682 /*
1683 * dq_sruninc and dq_srundec are public functions for
1684 * incrementing/decrementing the sruncnts when a thread on
1685 * a dispatcher queue is made schedulable/unschedulable by
1686 * resetting the TS_LOAD flag.
1687 *
 * The caller MUST have the thread lock, and therefore the dispatcher
 * queue lock, so that the operation which changes the flag, the
 * operation that checks the status of the thread to determine if it's
 * on a disp queue, AND the call to this function are one atomic
 * operation with respect to interrupts.
1693 */
1694
1695 /*
1696 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1697 */
1698 void
1699 dq_sruninc(kthread_t *t)
1700 {
1701 ASSERT(t->t_state == TS_RUN);
1702 ASSERT(t->t_schedflag & TS_LOAD);
1703
1704 THREAD_TRANSITION(t);
1705 setfrontdq(t);
1706 }
1707
1708 /*
1709 * See comment on calling conventions above.
1710 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1711 */
1712 void
1713 dq_srundec(kthread_t *t)
1714 {
1715 ASSERT(t->t_schedflag & TS_LOAD);
1716
1717 (void) dispdeq(t);
1718 disp_swapped_enq(t);
1719 }
1720
1721 /*
1722 * Change the dispatcher lock of thread to the "swapped_lock"
1723 * and return with thread lock still held.
1724 *
1725 * Called with thread_lock held, in transition state, and at high spl.
1726 */
1727 void
1728 disp_swapped_enq(kthread_t *tp)
1729 {
1730 ASSERT(THREAD_LOCK_HELD(tp));
1731 ASSERT(tp->t_schedflag & TS_LOAD);
1732
1733 switch (tp->t_state) {
1734 case TS_RUN:
1735 disp_lock_enter_high(&swapped_lock);
1736 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1737 break;
1738 case TS_ONPROC:
1739 disp_lock_enter_high(&swapped_lock);
1740 THREAD_TRANSITION(tp);
1741 wake_sched_sec = 1; /* tell clock to wake sched */
1742 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1743 break;
1744 default:
1745 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1746 }
1747 }
1748
1749 /*
1750 * This routine is called by setbackdq/setfrontdq if the thread is
 * not loaded, or is loaded but on the swap queue.
1752 *
1753 * Thread state TS_SLEEP implies that a swapped thread
1754 * has been woken up and needs to be swapped in by the swapper.
1755 *
 * Thread state TS_RUN implies that the priority of a swapped
 * thread is being increased by its scheduling class (e.g. ts_update).
1758 */
1759 static void
1760 disp_swapped_setrun(kthread_t *tp)
1761 {
1762 ASSERT(THREAD_LOCK_HELD(tp));
1763 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1764
1765 switch (tp->t_state) {
1766 case TS_SLEEP:
1767 disp_lock_enter_high(&swapped_lock);
1768 /*
		 * Wake up sched immediately (i.e., next tick) if the
		 * thread's priority is above maxclsyspri.
1771 */
1772 if (DISP_PRIO(tp) > maxclsyspri)
1773 wake_sched = 1;
1774 else
1775 wake_sched_sec = 1;
1776 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1777 break;
1778 case TS_RUN: /* called from ts_update */
1779 break;
1780 default:
1781 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1782 }
1783 }
1784
1785 /*
1786 * Make a thread give up its processor. Find the processor on
1787 * which this thread is executing, and have that processor
1788 * preempt.
1789 *
1790 * We allow System Duty Cycle (SDC) threads to be preempted even if
1791 * they are running at kernel priorities. To implement this, we always
1792 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1793 * calls cpu_surrender() very often, we only preempt if there is anyone
1794 * competing with us.
1795 */
1796 void
1797 cpu_surrender(kthread_t *tp)
1798 {
1799 cpu_t *cpup;
1800 int max_pri;
1801 int max_run_pri;
1802 klwp_t *lwp;
1803
1804 ASSERT(THREAD_LOCK_HELD(tp));
1805
1806 if (tp->t_state != TS_ONPROC)
1807 return;
1808 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1809 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1810 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1811 if (max_pri < max_run_pri)
1812 max_pri = max_run_pri;
1813
1814 if (tp->t_cid == sysdccid) {
1815 uint_t t_pri = DISP_PRIO(tp);
1816 if (t_pri > max_pri)
1817 return; /* we are not competing w/ anyone */
1818 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1819 } else {
1820 cpup->cpu_runrun = 1;
1821 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1822 cpup->cpu_kprunrun = 1;
1823 }
1824 }
1825
1826 /*
1827 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1828 */
1829 membar_enter();
1830
1831 DTRACE_SCHED1(surrender, kthread_t *, tp);
1832
1833 /*
1834 * Make the target thread take an excursion through trap()
1835 * to do preempt() (unless we're already in trap or post_syscall,
1836 * calling cpu_surrender via CL_TRAPRET).
1837 */
1838 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1839 lwp->lwp_state != LWP_USER) {
1840 aston(tp);
1841 if (cpup != CPU)
1842 poke_cpu(cpup->cpu_id);
1843 }
1844 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1845 "cpu_surrender:tid %p cpu %p", tp, cpup);
1846 }
1847
1848 /*
 * Commit to and ratify a scheduling decision: clear the runrun flags,
 * then check that no higher-priority thread has since appeared on the
 * local or kpreempt queue.  If one has, put tp back on the front of its
 * queue and return NULL so that the caller will reschedule.
1850 */
1851 /*ARGSUSED*/
1852 static kthread_t *
1853 disp_ratify(kthread_t *tp, disp_t *kpq)
1854 {
1855 pri_t tpri, maxpri;
1856 pri_t maxkpri;
1857 cpu_t *cpup;
1858
1859 ASSERT(tp != NULL);
1860 /*
1861 * Commit to, then ratify scheduling decision
1862 */
1863 cpup = CPU;
1864 if (cpup->cpu_runrun != 0)
1865 cpup->cpu_runrun = 0;
1866 if (cpup->cpu_kprunrun != 0)
1867 cpup->cpu_kprunrun = 0;
1868 if (cpup->cpu_chosen_level != -1)
1869 cpup->cpu_chosen_level = -1;
1870 membar_enter();
1871 tpri = DISP_PRIO(tp);
1872 maxpri = cpup->cpu_disp->disp_maxrunpri;
1873 maxkpri = kpq->disp_maxrunpri;
1874 if (maxpri < maxkpri)
1875 maxpri = maxkpri;
1876 if (tpri < maxpri) {
1877 /*
1878 * should have done better
1879 * put this one back and indicate to try again
1880 */
1881 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1882 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1883 thread_lock_high(tp);
1884 THREAD_TRANSITION(tp);
1885 setfrontdq(tp);
1886 thread_unlock_nopreempt(tp);
1887
1888 tp = NULL;
1889 }
1890 return (tp);
1891 }
1892
1893 /*
1894 * See if there is any work on the dispatcher queue for other CPUs.
1895 * If there is, dequeue the best thread and return.
1896 */
1897 static kthread_t *
1898 disp_getwork(cpu_t *cp)
1899 {
1900 cpu_t *ocp; /* other CPU */
1901 cpu_t *ocp_start;
1902 cpu_t *tcp; /* target local CPU */
1903 kthread_t *tp;
1904 kthread_t *retval = NULL;
1905 pri_t maxpri;
1906 disp_t *kpq; /* kp queue for this partition */
1907 lpl_t *lpl, *lpl_leaf;
1908 int leafidx, startidx;
1909 hrtime_t stealtime;
1910 lgrp_id_t local_id;
1911
1912 maxpri = -1;
1913 tcp = NULL;
1914
1915 kpq = &cp->cpu_part->cp_kp_queue;
1916 while (kpq->disp_maxrunpri >= 0) {
1917 /*
1918 * Try to take a thread from the kp_queue.
1919 */
1920 tp = (disp_getbest(kpq));
1921 if (tp)
1922 return (disp_ratify(tp, kpq));
1923 }
1924
1925 kpreempt_disable(); /* protect the cpu_active list */
1926
1927 /*
1928 * Try to find something to do on another CPU's run queue.
1929 * Loop through all other CPUs looking for the one with the highest
1930 * priority unbound thread.
1931 *
1932 * On NUMA machines, the partition's CPUs are consulted in order of
1933 * distance from the current CPU. This way, the first available
1934 * work found is also the closest, and will suffer the least
1935 * from being migrated.
1936 */
1937 lpl = lpl_leaf = cp->cpu_lpl;
1938 local_id = lpl_leaf->lpl_lgrpid;
1939 leafidx = startidx = 0;
1940
1941 /*
1942 * This loop traverses the lpl hierarchy. Higher level lpls represent
1943 * broader levels of locality
1944 */
1945 do {
1946 /* This loop iterates over the lpl's leaves */
1947 do {
1948 if (lpl_leaf != cp->cpu_lpl)
1949 ocp = lpl_leaf->lpl_cpus;
1950 else
1951 ocp = cp->cpu_next_lpl;
1952
1953 /* This loop iterates over the CPUs in the leaf */
1954 ocp_start = ocp;
1955 do {
1956 pri_t pri;
1957
1958 ASSERT(CPU_ACTIVE(ocp));
1959
1960 /*
1961 * End our stroll around this lpl if:
1962 *
1963 * - Something became runnable on the local
1964 * queue...which also ends our stroll around
1965 * the partition.
1966 *
1967 * - We happen across another idle CPU.
1968 * Since it is patrolling the next portion
1969 * of the lpl's list (assuming it's not
1970 * halted, or busy servicing an interrupt),
1971 * move to the next higher level of locality.
1972 */
1973 if (cp->cpu_disp->disp_nrunnable != 0) {
1974 kpreempt_enable();
1975 return (NULL);
1976 }
1977 if (ocp->cpu_dispatch_pri == -1) {
1978 if (ocp->cpu_disp_flags &
1979 CPU_DISP_HALTED ||
1980 ocp->cpu_intr_actv != 0)
1981 continue;
1982 else
1983 goto next_level;
1984 }
1985
1986 /*
1987 * If there's only one thread and the CPU
1988 * is in the middle of a context switch,
1989 * or it's currently running the idle thread,
1990 * don't steal it.
1991 */
1992 if ((ocp->cpu_disp_flags &
1993 CPU_DISP_DONTSTEAL) &&
1994 ocp->cpu_disp->disp_nrunnable == 1)
1995 continue;
1996
1997 pri = ocp->cpu_disp->disp_max_unbound_pri;
1998 if (pri > maxpri) {
1999 /*
2000 * Don't steal threads that we attempted
2001 * to steal recently until they're ready
2002 * to be stolen again.
2003 */
2004 stealtime = ocp->cpu_disp->disp_steal;
2005 if (stealtime == 0 ||
2006 stealtime - gethrtime() <= 0) {
2007 maxpri = pri;
2008 tcp = ocp;
2009 } else {
2010 /*
2011 * Don't update tcp, just set
2012 * the retval to T_DONTSTEAL, so
2013 * that if no acceptable CPUs
2014 * are found the return value
2015 * will be T_DONTSTEAL rather
					 * than NULL.
2017 */
2018 retval = T_DONTSTEAL;
2019 }
2020 }
2021 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2022
2023 /*
2024 * Iterate to the next leaf lpl in the resource set
2025 * at this level of locality. If we hit the end of
2026 * the set, wrap back around to the beginning.
2027 *
			 * Note: This iteration is NULL-terminated for a
			 * reason; see lpl_topo_bootstrap() in lgrp.c for
			 * details.
2030 */
2031 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2032 leafidx = 0;
2033 lpl_leaf = lpl->lpl_rset[leafidx];
2034 }
2035 } while (leafidx != startidx);
2036
2037 next_level:
2038 /*
2039 * Expand the search to include farther away CPUs (next
2040 * locality level). The closer CPUs that have already been
2041 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPU's local leaf lpl.
2046 */
2047 if ((lpl = lpl->lpl_parent) != NULL) {
2048 leafidx = startidx = lpl->lpl_id2rset[local_id];
2049 lpl_leaf = lpl->lpl_rset[leafidx];
2050 }
2051 } while (!tcp && lpl);
2052
2053 kpreempt_enable();
2054
2055 /*
2056 * If another queue looks good, and there is still nothing on
2057 * the local queue, try to transfer one or more threads
2058 * from it to our queue.
2059 */
2060 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2061 tp = disp_getbest(tcp->cpu_disp);
2062 if (tp == NULL || tp == T_DONTSTEAL)
2063 return (tp);
2064 return (disp_ratify(tp, kpq));
2065 }
2066 return (retval);
2067 }
2068
2069
2070 /*
2071 * disp_fix_unbound_pri()
2072 * Determines the maximum priority of unbound threads on the queue.
2073 * The priority is kept for the queue, but is only increased, never
2074 * reduced unless some CPU is looking for something on that queue.
2075 *
2076 * The priority argument is the known upper limit.
2077 *
2078 * Perhaps this should be kept accurately, but that probably means
2079 * separate bitmaps for bound and unbound threads. Since only idled
2080 * CPUs will have to do this recalculation, it seems better this way.
2081 */
2082 static void
2083 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2084 {
2085 kthread_t *tp;
2086 dispq_t *dq;
2087 ulong_t *dqactmap = dp->disp_qactmap;
2088 ulong_t mapword;
2089 int wx;
2090
2091 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2092
2093 ASSERT(pri >= 0); /* checked by caller */
2094
2095 /*
2096 * Start the search at the next lowest priority below the supplied
2097 * priority. This depends on the bitmap implementation.
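	 *
	 * As an illustration (assuming a 64-bit ulong_t, so BT_ULSHIFT is
	 * 6): for pri == 70, the word index computed below is 70 >> 6 == 1,
	 * and the mask (BT_BIW(70) - 1) keeps only bits 0..5 of that word,
	 * i.e. any active priorities 64..69 strictly below 70.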
2098 */
2099 do {
2100 wx = pri >> BT_ULSHIFT; /* index of word in map */
2101
2102 /*
2103 * Form mask for all lower priorities in the word.
2104 */
2105 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2106
2107 /*
2108 * Get next lower active priority.
2109 */
2110 if (mapword != 0) {
2111 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2112 } else if (wx > 0) {
2113 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2114 if (pri < 0)
2115 break;
2116 } else {
2117 pri = -1;
2118 break;
2119 }
2120
2121 /*
2122 * Search the queue for unbound, runnable threads.
2123 */
2124 dq = &dp->disp_q[pri];
2125 tp = dq->dq_first;
2126
2127 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2128 tp = tp->t_link;
2129 }
2130
2131 /*
2132 * If a thread was found, set the priority and return.
2133 */
2134 } while (tp == NULL);
2135
2136 /*
2137 * pri holds the maximum unbound thread priority or -1.
2138 */
2139 if (dp->disp_max_unbound_pri != pri)
2140 dp->disp_max_unbound_pri = pri;
2141 }
2142
2143 /*
2144 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
2146 * its disp_max_unbound_pri increased.
2147 */
2148 void
2149 disp_adjust_unbound_pri(kthread_t *tp)
2150 {
2151 disp_t *dp;
2152 pri_t tpri;
2153
2154 ASSERT(THREAD_LOCK_HELD(tp));
2155
2156 /*
2157 * Don't do anything if the thread is not bound, or
2158 * currently not runnable or swapped out.
2159 */
2160 if (tp->t_bound_cpu == NULL ||
2161 tp->t_state != TS_RUN ||
2162 tp->t_schedflag & TS_ON_SWAPQ)
2163 return;
2164
2165 tpri = DISP_PRIO(tp);
2166 dp = tp->t_bound_cpu->cpu_disp;
2167 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2168 if (tpri > dp->disp_max_unbound_pri)
2169 dp->disp_max_unbound_pri = tpri;
2170 }
2171
2172 /*
2173 * disp_getbest()
2174 * De-queue the highest priority unbound runnable thread.
2175 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2176 * Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable, so that the
 *	caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
2182 */
2183 static kthread_t *
2184 disp_getbest(disp_t *dp)
2185 {
2186 kthread_t *tp;
2187 dispq_t *dq;
2188 pri_t pri;
2189 cpu_t *cp, *tcp;
2190 boolean_t allbound;
2191
2192 disp_lock_enter(&dp->disp_lock);
2193
2194 /*
2195 * If there is nothing to run, or the CPU is in the middle of a
2196 * context switch of the only thread, return NULL.
2197 */
2198 tcp = dp->disp_cpu;
2199 cp = CPU;
2200 pri = dp->disp_max_unbound_pri;
2201 if (pri == -1 ||
2202 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2203 tcp->cpu_disp->disp_nrunnable == 1)) {
2204 disp_lock_exit_nopreempt(&dp->disp_lock);
2205 return (NULL);
2206 }
2207
	dq = &dp->disp_q[pri];

2211 /*
2212 * Assume that all threads are bound on this queue, and change it
2213 * later when we find out that it is not the case.
2214 */
2215 allbound = B_TRUE;
2216 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2217 hrtime_t now, nosteal, rqtime;
2218
2219 /*
2220 * Skip over bound threads which could be here even
2221 * though disp_max_unbound_pri indicated this level.
2222 */
2223 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2224 continue;
2225
2226 /*
2227 * We've got some unbound threads on this queue, so turn
2228 * the allbound flag off now.
2229 */
2230 allbound = B_FALSE;
2231
2232 /*
2233 * The thread is a candidate for stealing from its run queue. We
2234 * don't want to steal threads that became runnable just a
2235 * moment ago. This improves CPU affinity for threads that get
2236 * preempted for short periods of time and go back on the run
2237 * queue.
2238 *
2239 * We want to let it stay on its run queue if it was only placed
2240 * there recently and it was running on the same CPU before that
2241 * to preserve its cache investment. For the thread to remain on
2242 * its run queue, ALL of the following conditions must be
2243 * satisfied:
2244 *
2245 * - the disp queue should not be the kernel preemption queue
2246 * - delayed idle stealing should not be disabled
2247 * - nosteal_nsec should be non-zero
2248 * - it should run with user priority
2249 * - it should be on the run queue of the CPU where it was
2250 * running before being placed on the run queue
2251 * - it should be the only thread on the run queue (to prevent
2252 * extra scheduling latency for other threads)
		 * - it should have sat on the run queue for less than the
		 *   per-chip or global nosteal interval
		 * - in the case of CPUs with a shared cache, it should be on
		 *   the run queue of a CPU from a different chip
2257 *
2258 * The checks are arranged so that the ones that are faster are
2259 * placed earlier.
2260 */
2261 if (tcp == NULL ||
2262 pri >= minclsyspri ||
2263 tp->t_cpu != tcp)
2264 break;
2265
2266 /*
		 * Steal immediately if, due to the CMT processor
		 * architecture, migration between cp and tcp would incur
		 * no performance penalty.
2270 */
2271 if (pg_cmt_can_migrate(cp, tcp))
2272 break;
2273
2274 nosteal = nosteal_nsec;
2275 if (nosteal == 0)
2276 break;
2277
2278 /*
		 * Calculate the time spent sitting on the run queue.
2280 */
2281 now = gethrtime_unscaled();
2282 rqtime = now - tp->t_waitrq;
2283 scalehrtime(&rqtime);
2284
2285 /*
2286 * Steal immediately if the time spent on this run queue is more
2287 * than allowed nosteal delay.
2288 *
2289 * Negative rqtime check is needed here to avoid infinite
2290 * stealing delays caused by unlikely but not impossible
2291 * drifts between CPU times on different CPUs.
2292 */
2293 if (rqtime > nosteal || rqtime < 0)
2294 break;
2295
2296 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2297 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2298 scalehrtime(&now);
2299 /*
2300 * Calculate when this thread becomes stealable
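		 *
		 * As a worked example (illustrative values only): with a
		 * nosteal interval of 100us and 30us already spent on the
		 * run queue, the thread becomes stealable 70us from now.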
2301 */
2302 now += (nosteal - rqtime);
2303
2304 /*
2305 * Calculate time when some thread becomes stealable
2306 */
2307 if (now < dp->disp_steal)
2308 dp->disp_steal = now;
2309 }
2310
2311 /*
	 * If there were no unbound threads on this queue, recompute
	 * disp_max_unbound_pri before returning; its value is not always
	 * accurate because it isn't reduced until another idle CPU looks
	 * for work.
2316 */
2317 if (allbound)
2318 disp_fix_unbound_pri(dp, pri);
2319
2320 /*
2321 * If we reached the end of the queue and found no unbound threads
2322 * then return NULL so that other CPUs will be considered. If there
2323 * are unbound threads but they cannot yet be stolen, then
2324 * return T_DONTSTEAL and try again later.
2325 */
2326 if (tp == NULL) {
2327 disp_lock_exit_nopreempt(&dp->disp_lock);
2328 return (allbound ? NULL : T_DONTSTEAL);
2329 }
2330
2331 /*
2332 * Found a runnable, unbound thread, so remove it from queue.
2333 * dispdeq() requires that we have the thread locked, and we do,
2334 * by virtue of holding the dispatch queue lock. dispdeq() will
2335 * put the thread in transition state, thereby dropping the dispq
2336 * lock.
2337 */
2338
2339 #ifdef DEBUG
2340 {
2341 int thread_was_on_queue;
2342
2343 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2344 ASSERT(thread_was_on_queue);
2345 }
2346
2347 #else /* DEBUG */
2348 (void) dispdeq(tp); /* drops disp_lock */
2349 #endif /* DEBUG */
2350
2351 /*
	 * Reset the disp_queue steal time; we do not know what the
	 * smallest value across the queue is.
2354 */
2355 dp->disp_steal = 0;
2356
2357 tp->t_schedflag |= TS_DONT_SWAP;
2358
2359 /*
2360 * Setup thread to run on the current CPU.
2361 */
2362 tp->t_disp_queue = cp->cpu_disp;
2363
2364 cp->cpu_dispthread = tp; /* protected by spl only */
2365 cp->cpu_dispatch_pri = pri;
2366
2367 /*
2368 * There can be a memory synchronization race between disp_getbest()
2369 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2370 * to preempt the current thread to run the enqueued thread while
2371 * disp_getbest() and disp_ratify() are changing the current thread
2372 * to the stolen thread. This may lead to a situation where
2373 * cpu_resched() tries to preempt the wrong thread and the
2374 * stolen thread continues to run on the CPU which has been tagged
2375 * for preemption.
2376 * Later the clock thread gets enqueued but doesn't get to run on the
2377 * CPU causing the system to hang.
2378 *
2379 * To avoid this, grabbing and dropping the disp_lock (which does
2380 * a memory barrier) is needed to synchronize the execution of
2381 * cpu_resched() with disp_getbest() and disp_ratify() and
2382 * synchronize the memory read and written by cpu_resched(),
2383 * disp_getbest(), and disp_ratify() with each other.
2384 * (see CR#6482861 for more details).
2385 */
2386 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2387 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2388
2389 ASSERT(pri == DISP_PRIO(tp));
2390
2391 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2392
2393 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2394
2395 /*
2396 * Return with spl high so that swtch() won't need to raise it.
2397 * The disp_lock was dropped by dispdeq().
2398 */
2399
2400 return (tp);
2401 }
2402
2403 /*
2404 * disp_bound_common() - common routine for higher level functions
2405 * that check for bound threads under certain conditions.
2406 * If 'threadlistsafe' is set then there is no need to acquire
 * pidlock to stop the thread list from changing (e.g., if
2408 * disp_bound_* is called with cpus paused).
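 *
 * A hypothetical caller (illustrative only, not from this file) checking
 * whether a CPU may be taken offline might do:
 *
 *	if (disp_bound_threads(cp, 0) != 0)
 *		return (EBUSY);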
2409 */
2410 static int
2411 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2412 {
2413 int found = 0;
2414 kthread_t *tp;
2415
2416 ASSERT(flag);
2417
2418 if (!threadlistsafe)
2419 mutex_enter(&pidlock);
2420 tp = curthread; /* faster than allthreads */
2421 do {
2422 if (tp->t_state != TS_FREE) {
2423 /*
2424 * If an interrupt thread is busy, but the
2425 * caller doesn't care (i.e. BOUND_INTR is off),
2426 * then just ignore it and continue through.
2427 */
2428 if ((tp->t_flag & T_INTR_THREAD) &&
2429 !(flag & BOUND_INTR))
2430 continue;
2431
2432 /*
2433 * Skip the idle thread for the CPU
2434 * we're about to set offline.
2435 */
2436 if (tp == cp->cpu_idle_thread)
2437 continue;
2438
2439 /*
2440 * Skip the pause thread for the CPU
2441 * we're about to set offline.
2442 */
2443 if (tp == cp->cpu_pause_thread)
2444 continue;
2445
2446 if ((flag & BOUND_CPU) &&
2447 (tp->t_bound_cpu == cp ||
2448 tp->t_bind_cpu == cp->cpu_id ||
2449 tp->t_weakbound_cpu == cp)) {
2450 found = 1;
2451 break;
2452 }
2453
2454 if ((flag & BOUND_PARTITION) &&
2455 (tp->t_cpupart == cp->cpu_part)) {
2456 found = 1;
2457 break;
2458 }
2459 }
2460 } while ((tp = tp->t_next) != curthread && found == 0);
2461 if (!threadlistsafe)
2462 mutex_exit(&pidlock);
2463 return (found);
2464 }
2465
2466 /*
2467 * disp_bound_threads - return nonzero if threads are bound to the processor.
2468 * Called infrequently. Keep this simple.
2469 * Includes threads that are asleep or stopped but not onproc.
2470 */
2471 int
2472 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2473 {
2474 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2475 }
2476
2477 /*
2478 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2479 * to the given processor, including interrupt threads.
2480 */
2481 int
2482 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2483 {
2484 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2485 }
2486
2487 /*
2488 * disp_bound_partition - return nonzero if threads are bound to the same
2489 * partition as the processor.
2490 * Called infrequently. Keep this simple.
2491 * Includes threads that are asleep or stopped but not onproc.
2492 */
2493 int
2494 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2495 {
2496 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2497 }
2498
2499 /*
2500 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2501 * threads to other CPUs.
2502 */
2503 void
2504 disp_cpu_inactive(cpu_t *cp)
2505 {
2506 kthread_t *tp;
2507 disp_t *dp = cp->cpu_disp;
2508 dispq_t *dq;
2509 pri_t pri;
2510 int wasonq;
2511
2512 disp_lock_enter(&dp->disp_lock);
2513 while ((pri = dp->disp_max_unbound_pri) != -1) {
2514 dq = &dp->disp_q[pri];
2515 tp = dq->dq_first;
2516
2517 /*
2518 * Skip over bound threads.
2519 */
2520 while (tp != NULL && tp->t_bound_cpu != NULL) {
2521 tp = tp->t_link;
2522 }
2523
2524 if (tp == NULL) {
2525 /* disp_max_unbound_pri must be inaccurate, so fix it */
2526 disp_fix_unbound_pri(dp, pri);
2527 continue;
2528 }
2529
2530 wasonq = dispdeq(tp); /* drops disp_lock */
2531 ASSERT(wasonq);
2532 ASSERT(tp->t_weakbound_cpu == NULL);
2533
2534 setbackdq(tp);
2535 /*
2536 * Called from cpu_offline:
2537 *
2538 * cp has already been removed from the list of active cpus
2539 * and tp->t_cpu has been changed so there is no risk of
2540 * tp ending up back on cp.
2541 *
2542 * Called from cpupart_move_cpu:
2543 *
2544 * The cpu has moved to a new cpupart. Any threads that
		 * were on its dispatch queues before the move remain
2546 * in the old partition and can't run in the new partition.
2547 */
2548 ASSERT(tp->t_cpu != cp);
2549 thread_unlock(tp);
2550
2551 disp_lock_enter(&dp->disp_lock);
2552 }
2553 disp_lock_exit(&dp->disp_lock);
2554 }
2555
2556 /*
2557 * Return a score rating this CPU for running this thread: lower is better.
2558 *
2559 * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2560 * curcpu (as that's our own priority).
2561 *
2562 * If a cpu is the target of an offline request, then try to avoid it.
2563 *
2564 * Otherwise we'll use double the effective dispatcher priority for the CPU.
2565 *
2566 * We do this so ht_adjust_cpu_score() can increment the score if needed,
 * without ending up overriding a dispatcher priority.
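 *
 * For example (illustrative): an idle CPU has cpu_dispatch_pri ==
 * CPU_IDLE_PRI, giving a base score of 2 * CPU_IDLE_PRI == -2. Since all
 * base scores are even, ht_adjust_cpu_score() can add one to penalize a
 * CPU whose sibling is busy while still scoring it below any CPU with a
 * real dispatcher priority of zero or above.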
2568 */
2569 static pri_t
2570 cpu_score(cpu_t *cp, kthread_t *tp)
2571 {
2572 pri_t score;
2573
2574 if (tp == curthread && cp == curthread->t_cpu)
2575 score = 2 * CPU_IDLE_PRI;
2576 else if (cp == cpu_inmotion)
2577 score = SHRT_MAX;
2578 else
2579 score = 2 * cp->cpu_dispatch_pri;
2580
2581 if (2 * cp->cpu_disp->disp_maxrunpri > score)
2582 score = 2 * cp->cpu_disp->disp_maxrunpri;
2583 if (2 * cp->cpu_chosen_level > score)
2584 score = 2 * cp->cpu_chosen_level;
2585
2586 return (ht_adjust_cpu_score(tp, cp, score));
2587 }
2588
2589 /*
2590 * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2591 *
2592 * We are looking for a CPU with an effective dispatch priority lower than the
2593 * thread's, so that the thread will run immediately rather than be enqueued.
2594 * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2595 * If we don't find an available CPU there, we will expand our search to include
2596 * wider locality levels. (Note these groups are already divided by CPU
2597 * partition.)
2598 *
2599 * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2600 * the best home CPU we found.
2601 *
2602 * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2603 * other CPU. The caller should pass in the most recently used CPU for the
2604 * thread; it's of course possible that this CPU isn't in the home lgroup.
2605 *
2606 * This function must be called at either high SPL, or with preemption disabled,
2607 * so that the "hint" CPU cannot be removed from the online CPU list while we
2608 * are traversing it.
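 *
 * A typical call (illustrative sketch) from an enqueue path might be:
 *
 *	cp = disp_lowpri_cpu(tp->t_cpu, tp, DISP_PRIO(tp));
 *
 * passing the thread's most recently used CPU as the hint.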
2609 */
2610 cpu_t *
2611 disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2612 {
2613 cpu_t *bestcpu;
2614 cpu_t *besthomecpu;
2615 cpu_t *cp, *cpstart;
2616
2617 klgrpset_t done;
2618
2619 lpl_t *lpl_iter, *lpl_leaf;
2620
2621 ASSERT(hint != NULL);
2622 ASSERT(tp->t_lpl->lpl_ncpu > 0);
2623
2624 bestcpu = besthomecpu = NULL;
2625 klgrpset_clear(done);
2626
2627 lpl_iter = tp->t_lpl;
2628
2629 do {
2630 pri_t best = SHRT_MAX;
2631 klgrpset_t cur_set;
2632
2633 klgrpset_clear(cur_set);
2634
2635 for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2636 lpl_leaf = lpl_iter->lpl_rset[i];
2637 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2638 continue;
2639
2640 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2641
2642 if (hint->cpu_lpl == lpl_leaf)
2643 cp = cpstart = hint;
2644 else
2645 cp = cpstart = lpl_leaf->lpl_cpus;
2646
2647 do {
2648 pri_t score = cpu_score(cp, tp);
2649
2650 if (score < best) {
2651 best = score;
2652 bestcpu = cp;
2653
2654 /* An idle CPU: we're done. */
2655 if (score / 2 == CPU_IDLE_PRI)
2656 goto out;
2657 }
2658 } while ((cp = cp->cpu_next_lpl) != cpstart);
2659 }
2660
2661 if (bestcpu != NULL && tpri > (best / 2))
2662 goto out;
2663
2664 if (besthomecpu == NULL)
2665 besthomecpu = bestcpu;
2666
2667 /*
2668 * Add the lgrps we just considered to the "done" set
2669 */
2670 klgrpset_or(done, cur_set);
2671
2672 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2673
2674 /*
2675 * The specified priority isn't high enough to run immediately
2676 * anywhere, so just return the best CPU from the home lgroup.
2677 */
2678 bestcpu = besthomecpu;
2679
2680 out:
2681 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682 return (bestcpu);
2683 }
2684
2685 /*
2686 * This routine provides the generic idle cpu function for all processors.
2687 * If a processor has some specific code to execute when idle (say, to stop
2688 * the pipeline and save power) then that routine should be defined in the
 * processor's specific code (module_xx.c) and the global variable idle_cpu
2690 * set to that function.
2691 */
2692 static void
2693 generic_idle_cpu(void)
2694 {
2695 }
2696
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }
2702
2703 cpu_t *
2704 disp_choose_best_cpu(void)
2705 {
2706 kthread_t *t = curthread;
2707 cpu_t *curcpu = CPU;
2708
2709 ASSERT(t->t_preempt > 0);
2710 ASSERT(t->t_state == TS_ONPROC);
2711 ASSERT(t->t_schedflag & TS_VCPU);
2712
2713 if (ht_should_run(t, curcpu))
2714 return (curcpu);
2715
2716 return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2717 }