Print this page
10924 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Peter Tribble <peter.tribble@gmail.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/disp/disp.c
+++ new/usr/src/uts/common/disp/disp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 +/*
27 + * Copyright (c) 2018, Joyent, Inc. All rights reserved.
28 + */
29 +
26 30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 31 /* All Rights Reserved */
28 32
29 33
30 34 #include <sys/types.h>
31 35 #include <sys/param.h>
32 36 #include <sys/sysmacros.h>
33 37 #include <sys/signal.h>
34 38 #include <sys/user.h>
35 39 #include <sys/systm.h>
36 40 #include <sys/sysinfo.h>
37 41 #include <sys/var.h>
38 42 #include <sys/errno.h>
39 43 #include <sys/cmn_err.h>
40 44 #include <sys/debug.h>
41 45 #include <sys/inline.h>
42 46 #include <sys/disp.h>
43 47 #include <sys/class.h>
44 48 #include <sys/bitmap.h>
45 49 #include <sys/kmem.h>
46 50 #include <sys/cpuvar.h>
47 51 #include <sys/vtrace.h>
48 52 #include <sys/tnf.h>
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
49 53 #include <sys/cpupart.h>
50 54 #include <sys/lgrp.h>
51 55 #include <sys/pg.h>
52 56 #include <sys/cmt.h>
53 57 #include <sys/bitset.h>
54 58 #include <sys/schedctl.h>
55 59 #include <sys/atomic.h>
56 60 #include <sys/dtrace.h>
57 61 #include <sys/sdt.h>
58 62 #include <sys/archsystm.h>
63 +#include <sys/ht.h>
59 64
60 65 #include <vm/as.h>
61 66
62 67 #define BOUND_CPU 0x1
63 68 #define BOUND_PARTITION 0x2
64 69 #define BOUND_INTR 0x4
65 70
66 71 /* Dispatch queue allocation structure and functions */
67 72 struct disp_queue_info {
68 73 disp_t *dp; /* dispatcher whose queue storage is being replaced */
69 74 dispq_t *olddispq; /* previous disp_q, recorded by disp_dq_assign() */
70 75 dispq_t *newdispq; /* replacement array from disp_dq_alloc() */
71 76 ulong_t *olddqactmap; /* previous disp_qactmap bitmap */
72 77 ulong_t *newdqactmap; /* replacement bitmap from disp_dq_alloc() */
73 78 int oldnglobpris; /* previous disp_npri; sizes the old arrays when freed */
74 79 };
75 80 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 81 disp_t *dp);
77 82 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 83 static void disp_dq_free(struct disp_queue_info *dptr);
79 84
80 85 /* platform-specific routine to call when processor is idle */
81 86 static void generic_idle_cpu();
82 87 void (*idle_cpu)() = generic_idle_cpu;
83 88
84 89 /* routines invoked when a CPU enters/exits the idle loop */
85 90 static void idle_enter();
86 91 static void idle_exit();
87 92
88 93 /* platform-specific routine to call when thread is enqueued */
89 94 static void generic_enq_thread(cpu_t *, int);
90 95 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 96
92 97 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 98 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 99 pri_t intr_pri; /* interrupt thread priority base level */
95 100
96 101 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 102 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 103 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 104 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 105 int nswapped; /* total number of swapped threads */
101 106 void disp_swapped_enq(kthread_t *tp);
102 107 static void disp_swapped_setrun(kthread_t *tp);
103 108 static void cpu_resched(cpu_t *cp, pri_t tpri);
104 109
105 110 /*
106 111 * If this is set, only interrupt threads will cause kernel preemptions.
107 112 * This is done by changing the value of kpreemptpri. kpreemptpri
108 113 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 114 */
110 115 int only_intr_kpreempt;
111 116
112 117 extern void set_idle_cpu(int cpun);
113 118 extern void unset_idle_cpu(int cpun);
114 119 static void setkpdq(kthread_t *tp, int borf);
115 120 #define SETKP_BACK 0
116 121 #define SETKP_FRONT 1
117 122 /*
118 123 * Parameter that determines how recently a thread must have run
119 124 * on the CPU to be considered loosely-bound to that CPU to reduce
120 125 * cold cache effects. The interval is in hertz.
121 126 */
122 127 #define RECHOOSE_INTERVAL 3
123 128 int rechoose_interval = RECHOOSE_INTERVAL;
124 129
125 130 /*
126 131 * Parameter that determines how long (in nanoseconds) a thread must
127 132 * be sitting on a run queue before it can be stolen by another CPU
128 133 * to reduce migrations. The interval is in nanoseconds.
129 134 *
130 135 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 136 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
132 137 * here indicating it is uninitialized.
133 138 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134 139 *
135 140 */
136 141 #define NOSTEAL_UNINITIALIZED (-1)
137 142 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 143 extern void cmp_set_nosteal_interval(void);
139 144
140 145 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 146
142 147 disp_lock_t transition_lock; /* lock on transitioning threads */
143 148 disp_lock_t stop_lock; /* lock on stopped threads */
144 149
145 150 static void cpu_dispqalloc(int numpris);
146 151
147 152 /*
148 153 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 154 * a thread because it was sitting on its run queue for a very short
150 155 * period of time.
151 156 */
152 157 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
153 158
154 159 static kthread_t *disp_getwork(cpu_t *to);
155 160 static kthread_t *disp_getbest(disp_t *from);
156 161 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 162
158 163 void swtch_to(kthread_t *);
159 164
160 165 /*
161 166 * dispatcher and scheduler initialization
162 167 */
163 168
164 169 /*
165 170 * disp_setup - Common code to calculate and allocate dispatcher
166 171 * variables and structures based on the maximum priority.
 *
 * maxglobpri is the highest global priority the dispatcher must support;
 * oldnglobpris is the number of global priorities already provisioned
 * (dispinit() passes 0, disp_add() passes v.v_nglobpris).  Nothing is
 * changed unless maxglobpri + 1 + LOCK_LEVEL exceeds oldnglobpris.
 * The caller must hold cpu_lock.
167 172 */
168 173 static void
169 174 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 175 {
171 176 pri_t newnglobpris;
172 177
173 178 ASSERT(MUTEX_HELD(&cpu_lock));
174 179
175 180 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 181
177 182 if (newnglobpris > oldnglobpris) {
178 183 /*
179 184 * Allocate new kp queues for each CPU partition.
180 185 */
181 186 cpupart_kpqalloc(newnglobpris);
182 187
183 188 /*
184 189 * Allocate new dispatch queues for each CPU.
185 190 */
186 191 cpu_dispqalloc(newnglobpris);
187 192
188 193 /*
189 194 * compute new interrupt thread base priority
190 195 */
191 196 intr_pri = maxglobpri;
192 197 if (only_intr_kpreempt) {
193 198 kpreemptpri = intr_pri + 1;
 /* only override kpqpri if it still holds its KPQPRI placeholder */
194 199 if (kpqpri == KPQPRI)
195 200 kpqpri = kpreemptpri;
196 201 }
 /* record the new count only after the queues have been resized */
197 202 v.v_nglobpris = newnglobpris;
198 203 }
199 204 }
200 205
201 206 /*
202 207 * dispinit - Called to initialize all loaded classes and the
203 208 * dispatcher framework.
 *
 * Class initialization and disp_setup() run under cpu_lock; the lock is
 * dropped before the default class is looked up with getcid().  Issues
 * a CE_PANIC if the default scheduling class cannot be loaded.
204 209 */
205 210 void
206 211 dispinit(void)
207 212 {
208 213 id_t cid;
209 214 pri_t maxglobpri;
210 215 pri_t cl_maxglobpri;
211 216
 /* start below any valid priority; the ASSERT below checks it was raised */
212 217 maxglobpri = -1;
213 218
214 219 /*
215 220 * Initialize transition lock, which will always be set.
216 221 */
217 222 DISP_LOCK_INIT(&transition_lock);
218 223 disp_lock_enter_high(&transition_lock);
219 224 DISP_LOCK_INIT(&stop_lock);
220 225
221 226 mutex_enter(&cpu_lock);
222 227 CPU->cpu_disp->disp_maxrunpri = -1;
223 228 CPU->cpu_disp->disp_max_unbound_pri = -1;
224 229
225 230 /*
226 231 * Initialize the default CPU partition.
227 232 */
228 233 cpupart_initialize_default();
229 234 /*
230 235 * Call the class specific initialization functions for
231 236 * all pre-installed schedulers.
232 237 *
233 238 * We pass the size of a class specific parameter
234 239 * buffer to each of the initialization functions
235 240 * to try to catch problems with backward compatibility
236 241 * of class modules.
237 242 *
238 243 * For example a new class module running on an old system
239 244 * which didn't provide sufficiently large parameter buffers
240 245 * would be bad news. Class initialization modules can check for
241 246 * this and take action if they detect a problem.
242 247 */
243 248
244 249 for (cid = 0; cid < nclass; cid++) {
245 250 sclass_t *sc;
246 251
247 252 sc = &sclass[cid];
248 253 if (SCHED_INSTALLED(sc)) {
 /* cl_init() returns the class's maximum global priority */
249 254 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 255 &sc->cl_funcs);
251 256 if (cl_maxglobpri > maxglobpri)
252 257 maxglobpri = cl_maxglobpri;
253 258 }
254 259 }
255 260 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 /* kpqpri follows kpreemptpri unless it was changed from KPQPRI */
256 261 if (kpqpri == KPQPRI)
257 262 kpqpri = kpreemptpri;
258 263
259 264 ASSERT(maxglobpri >= 0);
 /* 0 old priorities: forces the initial queue allocation */
260 265 disp_setup(maxglobpri, 0);
261 266
262 267 mutex_exit(&cpu_lock);
263 268
264 269 /*
265 270 * Platform specific sticky scheduler setup.
266 271 */
267 272 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 273 cmp_set_nosteal_interval();
269 274
270 275 /*
271 276 * Get the default class ID; this may be later modified via
272 277 * dispadmin(1M). This will load the class (normally TS) and that will
273 278 * call disp_add(), which is why we had to drop cpu_lock first.
274 279 */
275 280 if (getcid(defaultclass, &defaultcid) != 0) {
276 281 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 282 defaultclass);
278 283 }
279 284 }
280 285
281 286 /*
282 287 * disp_add - Called with class pointer to initialize the dispatcher
283 288 * for a newly loaded class.
 *
 * Takes and releases cpu_lock itself, so the caller must not hold it.
284 289 */
285 290 void
286 291 disp_add(sclass_t *clp)
287 292 {
288 293 pri_t maxglobpri;
289 294 pri_t cl_maxglobpri;
290 295
291 296 mutex_enter(&cpu_lock);
292 297 /*
293 298 * Initialize the scheduler class.
294 299 */
 /* inverse of disp_setup(): nglobpris = maxglobpri + 1 + LOCK_LEVEL */
295 300 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 /* clp - sclass is the class's index in sclass[], i.e. its class id */
296 301 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 302 if (cl_maxglobpri > maxglobpri)
298 303 maxglobpri = cl_maxglobpri;
299 304
300 305 /*
301 306 * Since we're initializing a new scheduling class which has
302 307 * just been loaded, the size of the dispq may have changed.
303 308 * disp_setup() reallocates the queues only if the new class
304 309 * raised the number of global priorities above v.v_nglobpris.
305 310 */
306 311 disp_setup(maxglobpri, v.v_nglobpris);
307 312
308 313 mutex_exit(&cpu_lock);
309 314 }
310 315
311 316
312 317 /*
313 318 * For each CPU, allocate new dispatch queues
314 319 * with the stated number of priorities.
 *
 * Works in three phases: allocate replacement storage for every CPU on
 * cpu_list, swap it in while the CPUs are paused, then free the old
 * storage after the CPUs are restarted.  The caller must hold cpu_lock.
315 320 */
316 321 static void
317 322 cpu_dispqalloc(int numpris)
318 323 {
319 324 cpu_t *cpup;
320 325 struct disp_queue_info *disp_mem;
321 326 int i, num;
322 327
323 328 ASSERT(MUTEX_HELD(&cpu_lock));
324 329
 /* one bookkeeping slot per possible CPU (NCPU), zero-filled */
325 330 disp_mem = kmem_zalloc(NCPU *
326 331 sizeof (struct disp_queue_info), KM_SLEEP);
327 332
328 333 /*
329 334 * This routine must allocate all of the memory before stopping
330 335 * the cpus because it must not sleep in kmem_alloc while the
331 336 * CPUs are stopped. Locks they hold will not be freed until they
332 337 * are restarted.
333 338 */
334 339 i = 0;
335 340 cpup = cpu_list;
 /* cpu_list is circular: stop when the walk returns to its head */
336 341 do {
337 342 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 343 i++;
339 344 cpup = cpup->cpu_next;
340 345 } while (cpup != cpu_list);
 /* num = number of CPUs actually visited (slots in use) */
341 346 num = i;
342 347
343 348 pause_cpus(NULL, NULL);
344 349 for (i = 0; i < num; i++)
345 350 disp_dq_assign(&disp_mem[i], numpris);
346 351 start_cpus();
347 352
348 353 /*
349 354 * I must free all of the memory after starting the cpus because
350 355 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 356 */
352 357 for (i = 0; i < num; i++)
353 358 disp_dq_free(&disp_mem[i]);
354 359
355 360 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 361 }
357 362
/*
 * Allocate zero-filled replacement storage for dispatcher dp: an array
 * of numpris dispq_t entries and an active-queue bitmap of
 * (numpris / BT_NBIPUL) + 1 longs.  Both allocations use KM_SLEEP.
 * Only newdispq, newdqactmap and dp are filled in here; the old* fields
 * of dptr are left untouched.
 */
358 363 static void
359 364 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 365 {
361 366 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 367 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 368 sizeof (long), KM_SLEEP);
364 369 dptr->dp = dp;
365 370 }
366 371
/*
 * Install the storage prepared by disp_dq_alloc() into dptr->dp.  The
 * dispatcher's current queue array, bitmap and priority count are first
 * recorded in the old* fields of dptr so the caller can release them
 * with disp_dq_free(); any existing contents are copied into the new
 * storage.  numpris must be larger than the dispatcher's current
 * disp_npri (ASSERTed).
 */
367 372 static void
368 373 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 374 {
370 375 disp_t *dp;
371 376
372 377 dp = dptr->dp;
373 378 dptr->olddispq = dp->disp_q;
374 379 dptr->olddqactmap = dp->disp_qactmap;
375 380 dptr->oldnglobpris = dp->disp_npri;
376 381
377 382 ASSERT(dptr->oldnglobpris < numpris);
378 383
 /* a NULL disp_q means there was no earlier storage to carry over */
379 384 if (dptr->olddispq != NULL) {
380 385 /*
381 386 * Use kcopy because bcopy is platform-specific
382 387 * and could block while we might have paused the cpus.
 * The kcopy return values are deliberately ignored.
383 388 */
384 389 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 390 dptr->oldnglobpris * sizeof (dispq_t));
386 391 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 392 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 393 sizeof (long));
389 394 }
390 395 dp->disp_q = dptr->newdispq;
391 396 dp->disp_qactmap = dptr->newdqactmap;
 /* disp_q_limit points one element past the end of the new array */
392 397 dp->disp_q_limit = &dptr->newdispq[numpris];
393 398 dp->disp_npri = numpris;
394 399 }
395 400
/*
 * Release the storage recorded in dptr->olddispq and dptr->olddqactmap.
 * The sizes passed to kmem_free() are recomputed from dptr->oldnglobpris
 * with the same formulas disp_dq_alloc() uses to allocate.  NULL
 * pointers are skipped.  The dp and new* fields are not read.
 */
396 401 static void
397 402 disp_dq_free(struct disp_queue_info *dptr)
398 403 {
399 404 if (dptr->olddispq != NULL)
400 405 kmem_free(dptr->olddispq,
401 406 dptr->oldnglobpris * sizeof (dispq_t));
402 407 if (dptr->olddqactmap != NULL)
403 408 kmem_free(dptr->olddqactmap,
404 409 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 410 }
406 411
407 412 /*
408 413 * For a newly created CPU, initialize the dispatch queue.
409 414 * This is called before the CPU is known through cpu[] or on any lists.
 *
 * The queue array and bitmap are sized from the current v.v_nglobpris.
 * All allocations use KM_SLEEP.  The caller must hold cpu_lock.
410 415 */
411 416 void
412 417 disp_cpu_init(cpu_t *cp)
413 418 {
414 419 disp_t *dp;
415 420 dispq_t *newdispq;
416 421 ulong_t *newdqactmap;
417 422
418 423 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 424
 /*
  * The CPU recorded in cpu0_disp.disp_cpu reuses the statically
  * allocated cpu0_disp; every other CPU gets a heap-allocated disp_t.
  */
420 425 if (cp == cpu0_disp.disp_cpu)
421 426 dp = &cpu0_disp;
422 427 else
423 428 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 /* not part of the else: the disp_t is zeroed on both paths */
424 429 bzero(dp, sizeof (disp_t));
425 430 cp->cpu_disp = dp;
426 431 dp->disp_cpu = cp;
 /* -1 is the "nothing runnable" value tested elsewhere in this file */
427 432 dp->disp_maxrunpri = -1;
428 433 dp->disp_max_unbound_pri = -1;
429 434 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 435 /*
431 436 * Allocate memory for the dispatcher queue headers
432 437 * and the active queue bitmap.
433 438 */
434 439 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 440 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 441 sizeof (long), KM_SLEEP);
437 442 dp->disp_q = newdispq;
438 443 dp->disp_qactmap = newdqactmap;
439 444 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 445 dp->disp_npri = v.v_nglobpris;
441 446 }
442 447
/*
 * Tear down the dispatcher state set up by disp_cpu_init(): free the
 * queue array and bitmap, then the disp_t itself unless it is the
 * statically allocated cpu0_disp.  cp->cpu_disp is not cleared here.
 * The caller must hold cpu_lock.
 */
443 448 void
444 449 disp_cpu_fini(cpu_t *cp)
445 450 {
446 451 ASSERT(MUTEX_HELD(&cpu_lock));
447 452
448 453 disp_kp_free(cp->cpu_disp);
449 454 if (cp->cpu_disp != &cpu0_disp)
450 455 kmem_free(cp->cpu_disp, sizeof (disp_t));
451 456 }
452 457
453 458 /*
454 459 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 *
 * Does nothing unless npri exceeds dq->disp_npri.  Unlike
 * cpu_dispqalloc(), this routine does not pause the CPUs itself.
455 460 */
456 461 void
457 462 disp_kp_alloc(disp_t *dq, pri_t npri)
458 463 {
 /* stack scratch; fully populated by disp_dq_alloc() + disp_dq_assign() */
459 464 struct disp_queue_info mem_info;
460 465
461 466 if (npri > dq->disp_npri) {
462 467 /*
463 468 * Allocate memory for the new array.
464 469 */
465 470 disp_dq_alloc(&mem_info, npri, dq);
466 471
467 472 /*
468 473 * We need to copy the old structures to the new
469 474 * and free the old.
470 475 */
471 476 disp_dq_assign(&mem_info, npri);
472 477 disp_dq_free(&mem_info);
473 478 }
474 479 }
475 480
476 481 /*
477 482 * Free dispatch queue.
478 483 * Used for the kpreempt queues for a removed CPU partition and
479 484 * for the per-CPU queues of deleted CPUs.
 *
 * Only the queue array and bitmap are freed; the disp_t itself is not,
 * and its disp_q / disp_qactmap pointers are left as they were.
480 485 */
481 486 void
482 487 disp_kp_free(disp_t *dq)
483 488 {
484 489 struct disp_queue_info mem_info;
485 490
 /*
  * Present the current storage as "old" so disp_dq_free() releases it.
  * The remaining mem_info fields stay unset; disp_dq_free() does not
  * read them.
  */
486 491 mem_info.olddispq = dq->disp_q;
487 492 mem_info.olddqactmap = dq->disp_qactmap;
488 493 mem_info.oldnglobpris = dq->disp_npri;
489 494 disp_dq_free(&mem_info);
490 495 }
491 496
492 497 /*
493 498 * End dispatcher and scheduler initialization.
494 499 */
495 500
496 501 /*
497 502 * See if there's anything to do other than remain idle.
498 503 * Return non-zero if there is.
499 504 *
500 505 * This function must be called with high spl, or with
501 506 * kernel preemption disabled to prevent the partition's
502 507 * active cpu list from changing while being traversed.
503 508 *
504 509 * This is essentially a simpler version of disp_getwork()
505 510 * to be called by CPUs preparing to "halt".
 *
 * Always returns 0 when the calling CPU has CPU_OFFLINE set.
506 511 */
507 512 int
508 513 disp_anywork(void)
509 514 {
510 515 cpu_t *cp = CPU;
511 516 cpu_t *ocp;
 /* volatile: the count is re-read on every iteration of the walk below */
512 517 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 518
514 519 if (!(cp->cpu_flags & CPU_OFFLINE)) {
 /* work is queued on the partition (CP_MAXRUNPRI is not -1) */
515 520 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 521 return (1);
517 522
 /* walk the other CPUs in this partition; the list is circular */
518 523 for (ocp = cp->cpu_next_part; ocp != cp;
519 524 ocp = ocp->cpu_next_part) {
520 525 ASSERT(CPU_ACTIVE(ocp));
521 526
522 527 /*
523 528 * Something has appeared on the local run queue.
524 529 */
525 530 if (*local_nrunnable > 0)
526 531 return (1);
527 532 /*
528 533 * If we encounter another idle CPU that will
529 534 * soon be trolling around through disp_anywork()
530 535 * terminate our walk here and let this other CPU
531 536 * patrol the next part of the list.
532 537 */
533 538 if (ocp->cpu_dispatch_pri == -1 &&
534 539 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 540 return (0);
536 541 /*
537 542 * Work can be taken from another CPU if:
538 543 * - There is unbound work on the run queue
539 544 * - That work isn't a thread undergoing a
540 545 * context switch on an otherwise empty queue.
541 546 * - The CPU isn't running the idle loop.
542 547 */
543 548 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 549 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 550 ocp->cpu_disp->disp_nrunnable == 1) &&
546 551 ocp->cpu_dispatch_pri != -1)
547 552 return (1);
548 553 }
549 554 }
550 555 return (0);
551 556 }
552 557
553 558 /*
554 559 * Called when CPU enters the idle loop
 *
 * Switches the CPU microstate to CMS_IDLE, bumps the sys.idlethread
 * statistic and calls the set_idle_cpu() hook.  Paired with idle_exit().
555 560 */
556 561 static void
557 562 idle_enter()
558 563 {
559 564 cpu_t *cp = CPU;
560 565
561 566 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 567 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 568 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
564 569 }
565 570
566 571 /*
567 572 * Called when CPU exits the idle loop
 *
 * Switches the CPU microstate to CMS_SYSTEM and calls the
 * unset_idle_cpu() hook.  Paired with idle_enter().
568 573 */
569 574 static void
570 575 idle_exit()
571 576 {
572 577 cpu_t *cp = CPU;
573 578
574 579 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 580 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
576 581 }
577 582
578 583 /*
579 584 * Idle loop.
 *
 * Never returns.  While ncpus == 1 a simplified loop is used; after
 * that the multiprocessor loop also tries to take work from elsewhere
 * via disp_getwork().  idle_exit() is called before every switch away
 * and idle_enter() again once the idle thread resumes, so the two stay
 * balanced; the "continue" paths skip both.
580 585 */
581 586 void
582 587 idle()
583 588 {
584 589 struct cpu *cp = CPU; /* pointer to this CPU */
585 590 kthread_t *t; /* taken thread */
586 591
587 592 idle_enter();
588 593
589 594 /*
590 595 * Uniprocessor version of idle loop.
591 596 * Do this until notified that we're on an actual multiprocessor.
592 597 */
593 598 while (ncpus == 1) {
594 599 if (cp->cpu_disp->disp_nrunnable == 0) {
595 600 (*idle_cpu)();
596 601 continue;
597 602 }
598 603 idle_exit();
599 604 swtch();
600 605
601 606 idle_enter(); /* returned from swtch */
602 607 }
603 608
604 609 /*
605 610 * Multiprocessor idle loop.
606 611 */
607 612 for (;;) {
608 613 /*
609 614 * If CPU is completely quiesced by p_online(2), just wait
610 615 * here with minimal bus traffic until put online.
611 616 */
612 617 while (cp->cpu_flags & CPU_QUIESCED)
613 618 (*idle_cpu)();
614 619
615 620 if (cp->cpu_disp->disp_nrunnable != 0) {
616 621 idle_exit();
617 622 swtch();
618 623 } else {
 /* an offline CPU does not look for work on other queues */
619 624 if (cp->cpu_flags & CPU_OFFLINE)
620 625 continue;
621 626 if ((t = disp_getwork(cp)) == NULL) {
 /*
  * Nothing found: reset cpu_chosen_level if the
  * partition's kpreempt queue is empty too.
  */
622 627 if (cp->cpu_chosen_level != -1) {
623 628 disp_t *dp = cp->cpu_disp;
624 629 disp_t *kpq;
625 630
626 631 disp_lock_enter(&dp->disp_lock);
627 632 /*
628 633 * Set kpq under lock to prevent
629 634 * migration between partitions.
630 635 */
631 636 kpq = &cp->cpu_part->cp_kp_queue;
632 637 if (kpq->disp_maxrunpri == -1)
633 638 cp->cpu_chosen_level = -1;
634 639 disp_lock_exit(&dp->disp_lock);
635 640 }
636 641 (*idle_cpu)();
637 642 continue;
638 643 }
639 644 /*
640 645 * If there was a thread but we couldn't steal
641 646 * it, then keep trying.
642 647 */
643 648 if (t == T_DONTSTEAL)
644 649 continue;
645 650 idle_exit();
646 651 swtch_to(t);
647 652 }
648 653 idle_enter(); /* returned from swtch/swtch_to */
649 654 }
650 655 }
651 656
652 657
653 658 /*
654 659 * Preempt the currently running thread in favor of the highest
655 660 * priority thread. The class of the current thread controls
656 661 * where it goes on the dispatcher queues. If panicking, turn
657 662 * preemption off.
 *
 * Returns without switching if panicstr is set, or if the current
 * thread is no longer TS_ONPROC on this CPU's dispatch queue.
658 663 */
659 664 void
660 665 preempt()
661 666 {
662 667 kthread_t *t = curthread;
663 668 klwp_t *lwp = ttolwp(curthread);
664 669
665 670 if (panicstr)
666 671 return;
667 672
668 673 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 674
670 675 thread_lock(t);
671 676
672 677 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 678 /*
674 679 * this thread has already been chosen to be run on
675 680 * another CPU. Clear kprunrun on this CPU since we're
676 681 * already headed for swtch().
677 682 */
678 683 CPU->cpu_kprunrun = 0;
679 684 thread_unlock_nopreempt(t);
680 685 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 686 } else {
 /* count an involuntary context switch; lwp is NULL-checked first */
682 687 if (lwp != NULL)
683 688 lwp->lwp_ru.nivcsw++;
684 689 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 690 THREAD_TRANSITION(t);
 /* the thread's scheduling class decides where it is requeued */
686 691 CL_PREEMPT(t);
687 692 DTRACE_SCHED(preempt);
688 693 thread_unlock_nopreempt(t);
689 694
690 695 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 696
692 697 swtch(); /* clears CPU->cpu_runrun via disp() */
693 698 }
694 699 }
695 700
696 701 extern kthread_t *thread_unpin();
697 702
698 703 /*
699 704 * disp() - find the highest priority thread for this processor to run, and
700 705 * set it in TS_ONPROC state so that resume() can be called to run it.
 *
 * The partition's kpreempt queue (cp_kp_queue) is consulted first: while
 * it holds work at or above this CPU's disp_maxrunpri and the CPU is not
 * offline, a thread is taken from it with disp_getbest().  Otherwise the
 * first thread on this CPU's highest-priority non-empty queue is
 * dequeued.  If the local queue is empty, disp_getwork() is tried
 * (unless the CPU is offline) and the idle thread is the fallback.
 * A thread taken from the kpreempt queue or the local queue is passed
 * through disp_ratify(); a NULL result means the choice was rejected and
 * the search continues.  Callers in this file rely on disp() returning
 * with spl high.
701 706 */
702 707 static kthread_t *
703 708 disp()
704 709 {
705 710 cpu_t *cpup;
706 711 disp_t *dp;
707 712 kthread_t *tp;
708 713 dispq_t *dq;
709 714 int maxrunword;
710 715 pri_t pri;
711 716 disp_t *kpq;
712 717
713 718 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 719
715 720 cpup = CPU;
716 721 /*
717 722 * Find the highest priority loaded, runnable thread.
718 723 */
719 724 dp = cpup->cpu_disp;
720 725
721 726 reschedule:
722 727 /*
723 728 * If there is more important work on the global queue with a better
724 729 * priority than the maximum on this CPU, take it now.
725 730 */
726 731 kpq = &cpup->cpu_part->cp_kp_queue;
727 732 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 733 pri >= dp->disp_maxrunpri &&
729 734 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 735 (tp = disp_getbest(kpq)) != NULL) {
731 736 if (disp_ratify(tp, kpq) != NULL) {
732 737 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 738 "disp_end:tid %p", tp);
734 739 return (tp);
735 740 }
736 741 }
737 742
738 743 disp_lock_enter(&dp->disp_lock);
739 744 pri = dp->disp_maxrunpri;
740 745
741 746 /*
742 747 * If there is nothing to run, look at what's runnable on other queues.
743 748 * Choose the idle thread if the CPU is quiesced.
744 749 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 750 * interrupt threads, which will be the only threads on the CPU's own
746 751 * queue, but cannot run threads from other queues.
747 752 */
748 753 if (pri == -1) {
749 754 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 755 disp_lock_exit(&dp->disp_lock);
 /*
  * T_DONTSTEAL is treated like "no work": run the idle
  * thread.  A real thread from disp_getwork() is returned
  * as-is, without the per-CPU bookkeeping below.
  * NOTE(review): presumably disp_getwork() has already
  * put it on-proc -- confirm.
  */
751 756 if ((tp = disp_getwork(cpup)) == NULL ||
752 757 tp == T_DONTSTEAL) {
753 758 tp = cpup->cpu_idle_thread;
754 759 (void) splhigh();
755 760 THREAD_ONPROC(tp, cpup);
756 761 cpup->cpu_dispthread = tp;
757 762 cpup->cpu_dispatch_pri = -1;
758 763 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 764 cpup->cpu_chosen_level = -1;
760 765 }
761 766 } else {
762 767 disp_lock_exit_high(&dp->disp_lock);
763 768 tp = cpup->cpu_idle_thread;
764 769 THREAD_ONPROC(tp, cpup);
765 770 cpup->cpu_dispthread = tp;
766 771 cpup->cpu_dispatch_pri = -1;
767 772 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 773 cpup->cpu_chosen_level = -1;
769 774 }
770 775 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 776 "disp_end:tid %p", tp);
772 777 return (tp);
773 778 }
774 779
 /* take the thread at the head of the highest-priority queue */
775 780 dq = &dp->disp_q[pri];
776 781 tp = dq->dq_first;
777 782
778 783 ASSERT(tp != NULL);
779 784 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
780 785
781 786 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 787
783 788 /*
784 789 * Found it so remove it from queue.
785 790 */
786 791 dp->disp_nrunnable--;
787 792 dq->dq_sruncnt--;
788 793 if ((dq->dq_first = tp->t_link) == NULL) {
789 794 ulong_t *dqactmap = dp->disp_qactmap;
790 795
791 796 ASSERT(dq->dq_sruncnt == 0);
792 797 dq->dq_last = NULL;
793 798
794 799 /*
795 800 * The queue is empty, so the corresponding bit needs to be
796 801 * turned off in dqactmap. If nrunnable != 0, we just took the
797 802 * last runnable thread off the
798 803 * highest queue, so recompute disp_maxrunpri.
799 804 */
800 805 maxrunword = pri >> BT_ULSHIFT;
801 806 dqactmap[maxrunword] &= ~BT_BIW(pri);
802 807
803 808 if (dp->disp_nrunnable == 0) {
804 809 dp->disp_max_unbound_pri = -1;
805 810 dp->disp_maxrunpri = -1;
806 811 } else {
807 812 int ipri;
808 813
809 814 ipri = bt_gethighbit(dqactmap, maxrunword);
810 815 dp->disp_maxrunpri = ipri;
 /* max_unbound_pri is lowered to, never above, maxrunpri */
811 816 if (ipri < dp->disp_max_unbound_pri)
812 817 dp->disp_max_unbound_pri = ipri;
813 818 }
814 819 } else {
 /* queue still has threads; just unlink the one we took */
815 820 tp->t_link = NULL;
816 821 }
817 822
818 823 /*
819 824 * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 825 * out this thread before we have a chance to run it.
821 826 * While running, it is protected against swapping by t_lock.
822 827 */
823 828 tp->t_schedflag |= TS_DONT_SWAP;
824 829 cpup->cpu_dispthread = tp; /* protected by spl only */
825 830 cpup->cpu_dispatch_pri = pri;
826 831 ASSERT(pri == DISP_PRIO(tp));
827 832 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 833 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
829 834
830 835 ASSERT(tp != NULL);
831 836 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 837 "disp_end:tid %p", tp);
833 838
 /* choice rejected by disp_ratify(): start the search over */
834 839 if (disp_ratify(tp, kpq) == NULL)
835 840 goto reschedule;
836 841
837 842 return (tp);
838 843 }
839 844
840 845 /*
841 846 * swtch()
842 847 * Find best runnable thread and run it.
843 848 * Called with the current thread already switched to a new state,
844 849 * on a sleep queue, run queue, stopped, and not zombied.
845 850 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 851 * Always drops spl to the base level (spl0()).
 *
 * Three paths:
 * - The caller has a pinned thread (t_intr != NULL): that thread is
 *   unpinned and resumed with resume_from_intr(); disp() is not called.
 * - disp() picks a different thread: accounting is updated and
 *   resume() switches to it.
 * - disp() picks the caller again: the thread stays on the CPU and
 *   spl0() is called here.
847 852 */
848 853 void
849 854 swtch()
850 855 {
851 856 kthread_t *t = curthread;
852 857 kthread_t *next;
853 858 cpu_t *cp;
854 859
855 860 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 861
857 862 if (t->t_flag & T_INTR_THREAD)
858 863 cpu_intr_swtch_enter(t);
859 864
860 865 if (t->t_intr != NULL) {
861 866 /*
862 867 * We are an interrupt thread. Setup and return
863 868 * the interrupted thread to be resumed.
864 869 */
865 870 (void) splhigh(); /* block other scheduler action */
866 871 cp = CPU; /* now protected against migration */
867 872 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 873 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 874 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 875 next = thread_unpin();
871 876 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 877 resume_from_intr(next);
873 878 } else {
874 879 #ifdef DEBUG
 /*
  * DEBUG only: re-check under the thread lock that the caller is not
  * still TS_ONPROC on this CPU with t_preempt == 0.
  */
875 880 if (t->t_state == TS_ONPROC &&
876 881 t->t_disp_queue->disp_cpu == CPU &&
877 882 t->t_preempt == 0) {
878 883 thread_lock(t);
879 884 ASSERT(t->t_state != TS_ONPROC ||
880 885 t->t_disp_queue->disp_cpu != CPU ||
881 886 t->t_preempt != 0); /* cannot migrate */
882 887 thread_unlock_nopreempt(t);
883 888 }
884 889 #endif /* DEBUG */
885 890 cp = CPU;
886 891 next = disp(); /* returns with spl high */
887 892 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 893
889 894 /* OK to steal anything left on run queue */
890 895 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 896
892 897 if (next != t) {
893 898 hrtime_t now;
894 899
895 900 now = gethrtime_unscaled();
896 901 pg_ev_thread_swtch(cp, now, t, next);
897 902
898 903 /*
899 904 * If t was previously in the TS_ONPROC state,
900 905 * setfrontdq and setbackdq won't have set its t_waitrq.
901 906 * Since we now finally know that we're switching away
902 907 * from this thread, set its t_waitrq if it is on a run
903 908 * queue.
904 909 */
905 910 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 911 t->t_waitrq = now;
907 912 }
908 913
909 914 /*
910 915 * restore mstate of thread that we are switching to
911 916 */
912 917 restore_mstate(next);
913 918
914 919 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 /* stamp both the CPU and the outgoing thread with the current lbolt */
915 920 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 921 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 922
918 923 if (dtrace_vtime_active)
919 924 dtrace_vtime_switch(next);
920 925
921 926 resume(next);
922 927 /*
923 928 * The TR_RESUME_END and TR_SWTCH_END trace points
924 929 * appear at the end of resume(), because we may not
925 930 * return here
926 931 */
927 932 } else {
 /* undo the cpu_intr_swtch_enter() done at the top of swtch() */
928 933 if (t->t_flag & T_INTR_THREAD)
929 934 cpu_intr_swtch_exit(t);
930 935 /*
931 936 * Threads that enqueue themselves on a run queue defer
932 937 * setting t_waitrq. It is then either set in swtch()
933 938 * when the CPU is actually yielded, or not at all if it
934 939 * is remaining on the CPU.
935 940 * There is however a window between where the thread
936 941 * placed itself on a run queue, and where it selects
937 942 * itself in disp(), where a third party (eg. clock()
938 943 * doing tick processing) may have re-enqueued this
939 944 * thread, setting t_waitrq in the process. We detect
940 945 * this race by noticing that despite switching to
941 946 * ourself, our t_waitrq has been set, and should be
942 947 * cleared.
943 948 */
944 949 if (t->t_waitrq != 0)
945 950 t->t_waitrq = 0;
946 951
947 952 pg_ev_thread_remain(cp, t);
948 953
949 954 DTRACE_SCHED(remain__cpu);
950 955 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 956 (void) spl0();
952 957 }
953 958 }
954 959 }
955 960
956 961 /*
957 962 * swtch_from_zombie()
958 963 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 964 * eliminated from normal resume.
960 965 * Find best runnable thread and run it.
961 966 * Called with the current thread zombied.
962 967 * Zombies cannot migrate, so CPU references are safe.
 * Unlike swtch(), no t_waitrq, t_disp_time or cpu_last_swtch updates
 * are made for the outgoing thread, and disp() must never pick the
 * caller again (ASSERTed).
963 968 */
964 969 void
965 970 swtch_from_zombie()
966 971 {
967 972 kthread_t *next;
968 973 cpu_t *cpu = CPU;
969 974
970 975 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 976
972 977 ASSERT(curthread->t_state == TS_ZOMB);
973 978
974 979 next = disp(); /* returns with spl high */
975 980 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 981 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 982 ASSERT(next != curthread);
978 983 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 984
980 985 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 986
982 987 restore_mstate(next);
983 988
984 989 if (dtrace_vtime_active)
985 990 dtrace_vtime_switch(next);
986 991
987 992 resume_from_zombie(next);
988 993 /*
989 994 * The TR_RESUME_END and TR_SWTCH_END trace points
990 995 * appear at the end of resume(), because we certainly will not
991 996 * return here
992 997 */
993 998 }
994 999
995 1000 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 1001
997 1002 /*
998 1003 * search_disp_queues()
999 1004 * Search the given dispatch queues for thread tp.
1000 1005 * Return 1 if tp is found, otherwise return 0.
1001 1006 */
1002 1007 static int
1003 1008 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 1009 {
1005 1010 dispq_t *dq;
1006 1011 dispq_t *eq;
1007 1012
1008 1013 disp_lock_enter_high(&dp->disp_lock);
1009 1014
1010 1015 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1016 kthread_t *rp;
1012 1017
1013 1018 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1019
1015 1020 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1021 if (tp == rp) {
1017 1022 disp_lock_exit_high(&dp->disp_lock);
1018 1023 return (1);
1019 1024 }
1020 1025 }
1021 1026 disp_lock_exit_high(&dp->disp_lock);
1022 1027
1023 1028 return (0);
1024 1029 }
1025 1030
1026 1031 /*
1027 1032 * thread_on_queue()
1028 1033 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1034 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1035 */
1031 1036 static int
1032 1037 thread_on_queue(kthread_t *tp)
1033 1038 {
1034 1039 cpu_t *cp;
1035 1040 struct cpupart *part;
1036 1041
1037 1042 ASSERT(getpil() >= DISP_LEVEL);
1038 1043
1039 1044 /*
1040 1045 * Search the per-CPU dispatch queues for tp.
1041 1046 */
1042 1047 cp = CPU;
1043 1048 do {
1044 1049 if (search_disp_queues(cp->cpu_disp, tp))
1045 1050 return (1);
1046 1051 } while ((cp = cp->cpu_next_onln) != CPU);
1047 1052
1048 1053 /*
1049 1054 * Search the partition-wide kpreempt queues for tp.
1050 1055 */
1051 1056 part = CPU->cpu_part;
1052 1057 do {
1053 1058 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1059 return (1);
1055 1060 } while ((part = part->cp_next) != CPU->cpu_part);
1056 1061
1057 1062 return (0);
1058 1063 }
1059 1064
1060 1065 #else
1061 1066
1062 1067 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1063 1068
1064 1069 #endif /* DEBUG */
1065 1070
1066 1071 /*
1067 1072 * like swtch(), but switch to a specified thread taken from another CPU.
1068 1073 * called with spl high.
1069 1074 */
1070 1075 void
1071 1076 swtch_to(kthread_t *next)
1072 1077 {
1073 1078 cpu_t *cp = CPU;
1074 1079 hrtime_t now;
1075 1080
1076 1081 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1082
1078 1083 /*
1079 1084 * Update context switch statistics.
1080 1085 */
1081 1086 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1087
1083 1088 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1089
1085 1090 now = gethrtime_unscaled();
1086 1091 pg_ev_thread_swtch(cp, now, curthread, next);
1087 1092
1088 1093 /* OK to steal anything left on run queue */
1089 1094 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1095
1091 1096 /* record last execution time */
1092 1097 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1098
1094 1099 /*
1095 1100 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1101 * won't have set its t_waitrq. Since we now finally know that we're
1097 1102 * switching away from this thread, set its t_waitrq if it is on a run
1098 1103 * queue.
1099 1104 */
1100 1105 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1106 curthread->t_waitrq = now;
1102 1107 }
1103 1108
1104 1109 /* restore next thread to previously running microstate */
1105 1110 restore_mstate(next);
1106 1111
1107 1112 if (dtrace_vtime_active)
↓ open down ↓ |
1039 lines elided |
↑ open up ↑ |
1108 1113 dtrace_vtime_switch(next);
1109 1114
1110 1115 resume(next);
1111 1116 /*
1112 1117 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1118 * appear at the end of resume(), because we may not
1114 1119 * return here
1115 1120 */
1116 1121 }
1117 1122
1118 -#define CPU_IDLING(pri) ((pri) == -1)
1119 -
1120 1123 static void
1121 1124 cpu_resched(cpu_t *cp, pri_t tpri)
1122 1125 {
1123 1126 int call_poke_cpu = 0;
1124 1127 pri_t cpupri = cp->cpu_dispatch_pri;
1125 1128
1126 - if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1129 + if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1127 1130 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1131 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1132 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1133 cp->cpu_runrun = 1;
1131 1134 aston(cp->cpu_dispthread);
1132 1135 if (tpri < kpreemptpri && cp != CPU)
1133 1136 call_poke_cpu = 1;
1134 1137 }
1135 1138 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1139 cp->cpu_kprunrun = 1;
1137 1140 if (cp != CPU)
1138 1141 call_poke_cpu = 1;
1139 1142 }
1140 1143 }
1141 1144
1142 1145 /*
1143 1146 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1147 */
1145 1148 membar_enter();
1146 1149
1147 1150 if (call_poke_cpu)
1148 1151 poke_cpu(cp->cpu_id);
1149 1152 }
1150 1153
1151 1154 /*
1152 1155 * setbackdq() keeps runqs balanced such that the difference in length
1153 1156 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1157 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1158 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1159 * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1160 */
1158 1161 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 1162 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 1163 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 1164
1162 1165 /*
1163 1166 * Macro that evaluates to true if it is likely that the thread has cache
1164 1167 * warmth. This is based on the amount of time that has elapsed since the
1165 1168 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1169 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1170 * some affinity for t->t_cpu.
1168 1171 */
1169 1172 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1173 ((thread == curthread) || \
1171 1174 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 1175 /*
1173 1176 * Put the specified thread on the back of the dispatcher
1174 1177 * queue corresponding to its current priority.
1175 1178 *
1176 1179 * Called with the thread in transition, onproc or stopped state
1177 1180 * and locked (transition implies locked) and at high spl.
1178 1181 * Returns with the thread in TS_RUN state and still locked.
1179 1182 */
1180 1183 void
1181 1184 setbackdq(kthread_t *tp)
1182 1185 {
1183 1186 dispq_t *dq;
1184 1187 disp_t *dp;
1185 1188 cpu_t *cp;
1186 1189 pri_t tpri;
1187 1190 int bound;
1188 1191 boolean_t self;
1189 1192
1190 1193 ASSERT(THREAD_LOCK_HELD(tp));
1191 1194 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1195 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1193 1196
1194 1197 /*
1195 1198 * If thread is "swapped" or on the swap queue don't
1196 1199 * queue it, but wake sched.
1197 1200 */
1198 1201 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 1202 disp_swapped_setrun(tp);
1200 1203 return;
1201 1204 }
1202 1205
1203 1206 self = (tp == curthread);
1204 1207
1205 1208 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 1209 bound = 1;
1207 1210 else
1208 1211 bound = 0;
1209 1212
1210 1213 tpri = DISP_PRIO(tp);
1211 1214 if (ncpus == 1)
↓ open down ↓ |
75 lines elided |
↑ open up ↑ |
1212 1215 cp = tp->t_cpu;
1213 1216 else if (!bound) {
1214 1217 if (tpri >= kpqpri) {
1215 1218 setkpdq(tp, SETKP_BACK);
1216 1219 return;
1217 1220 }
1218 1221
1219 1222 /*
1220 1223 * We'll generally let this thread continue to run where
1221 1224 * it last ran...but will consider migration if:
1222 - * - We thread probably doesn't have much cache warmth.
1225 + * - The thread probably doesn't have much cache warmth.
1226 + * - HT exclusion would prefer us to run elsewhere.
1223 1227 * - The CPU where it last ran is the target of an offline
1224 1228 * request.
1225 - * - The thread last ran outside it's home lgroup.
1229 + * - The thread last ran outside its home lgroup.
1226 1230 */
1227 1231 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 - (tp->t_cpu == cpu_inmotion)) {
1229 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 - } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 - self ? tp->t_cpu : NULL);
1232 + !ht_should_run(tp, tp->t_cpu) ||
1233 + (tp->t_cpu == cpu_inmotion) ||
1234 + !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1235 + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1233 1236 } else {
1234 1237 cp = tp->t_cpu;
1235 1238 }
1236 1239
1237 1240 if (tp->t_cpupart == cp->cpu_part) {
1238 1241 int qlen;
1239 1242
1240 1243 /*
1241 1244 * Perform any CMT load balancing
1242 1245 */
1243 1246 cp = cmt_balance(tp, cp);
1244 1247
1245 1248 /*
1246 1249 * Balance across the run queues
1247 1250 */
1248 1251 qlen = RUNQ_LEN(cp, tpri);
1249 1252 if (tpri >= RUNQ_MATCH_PRI &&
1250 1253 !(tp->t_schedflag & TS_RUNQMATCH))
↓ open down ↓ |
8 lines elided |
↑ open up ↑ |
1251 1254 qlen -= RUNQ_MAX_DIFF;
1252 1255 if (qlen > 0) {
1253 1256 cpu_t *newcp;
1254 1257
1255 1258 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1259 newcp = cp->cpu_next_part;
1257 1260 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1261 newcp = cp->cpu_next_part;
1259 1262 }
1260 1263
1261 - if (RUNQ_LEN(newcp, tpri) < qlen) {
1264 + if (ht_should_run(tp, newcp) &&
1265 + RUNQ_LEN(newcp, tpri) < qlen) {
1262 1266 DTRACE_PROBE3(runq__balance,
1263 1267 kthread_t *, tp,
1264 1268 cpu_t *, cp, cpu_t *, newcp);
1265 1269 cp = newcp;
1266 1270 }
1267 1271 }
1268 1272 } else {
1269 1273 /*
1270 1274 * Migrate to a cpu in the new partition.
1271 1275 */
1272 - cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 - tp->t_lpl, tp->t_pri, NULL);
1276 + cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
1277 + tp->t_pri);
1274 1278 }
1275 1279 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1280 } else {
1277 1281 /*
1278 1282 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1283 * a short time until weak binding that existed when the
1280 1284 * strong binding was established has dropped) so we must
1281 1285 * favour weak binding over strong.
1282 1286 */
1283 1287 cp = tp->t_weakbound_cpu ?
1284 1288 tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1289 }
1286 1290 /*
1287 1291 * A thread that is ONPROC may be temporarily placed on the run queue
1288 1292 * but then chosen to run again by disp. If the thread we're placing on
1289 1293 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1294 * replacement process is actually scheduled in swtch(). In this
1291 1295 * situation, curthread is the only thread that could be in the ONPROC
1292 1296 * state.
1293 1297 */
1294 1298 if ((!self) && (tp->t_waitrq == 0)) {
1295 1299 hrtime_t curtime;
1296 1300
1297 1301 curtime = gethrtime_unscaled();
1298 1302 (void) cpu_update_pct(tp, curtime);
1299 1303 tp->t_waitrq = curtime;
1300 1304 } else {
1301 1305 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1306 }
1303 1307
1304 1308 dp = cp->cpu_disp;
1305 1309 disp_lock_enter_high(&dp->disp_lock);
1306 1310
1307 1311 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 1312 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 1313 tpri, cp, tp);
1310 1314
1311 1315 #ifndef NPROBE
1312 1316 /* Kernel probe */
1313 1317 if (tnf_tracing_active)
1314 1318 tnf_thread_queue(tp, cp, tpri);
1315 1319 #endif /* NPROBE */
1316 1320
1317 1321 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1322
1319 1323 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1320 1324 tp->t_disp_queue = dp;
1321 1325 tp->t_link = NULL;
1322 1326
1323 1327 dq = &dp->disp_q[tpri];
1324 1328 dp->disp_nrunnable++;
1325 1329 if (!bound)
1326 1330 dp->disp_steal = 0;
1327 1331 membar_enter();
1328 1332
1329 1333 if (dq->dq_sruncnt++ != 0) {
1330 1334 ASSERT(dq->dq_first != NULL);
1331 1335 dq->dq_last->t_link = tp;
1332 1336 dq->dq_last = tp;
1333 1337 } else {
1334 1338 ASSERT(dq->dq_first == NULL);
1335 1339 ASSERT(dq->dq_last == NULL);
1336 1340 dq->dq_first = dq->dq_last = tp;
1337 1341 BT_SET(dp->disp_qactmap, tpri);
1338 1342 if (tpri > dp->disp_maxrunpri) {
1339 1343 dp->disp_maxrunpri = tpri;
1340 1344 membar_enter();
1341 1345 cpu_resched(cp, tpri);
1342 1346 }
1343 1347 }
1344 1348
1345 1349 if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1350 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1351 /*
1348 1352 * If there are no other unbound threads on the
1349 1353 * run queue, don't allow other CPUs to steal
1350 1354 * this thread while we are in the middle of a
1351 1355 * context switch. We may just switch to it
1352 1356 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1357 * in swtch and swtch_to.
1354 1358 */
1355 1359 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1360 }
1357 1361 dp->disp_max_unbound_pri = tpri;
1358 1362 }
1359 1363 (*disp_enq_thread)(cp, bound);
1360 1364 }
1361 1365
1362 1366 /*
1363 1367 * Put the specified thread on the front of the dispatcher
1364 1368 * queue corresponding to its current priority.
1365 1369 *
1366 1370 * Called with the thread in transition, onproc or stopped state
1367 1371 * and locked (transition implies locked) and at high spl.
1368 1372 * Returns with the thread in TS_RUN state and still locked.
1369 1373 */
1370 1374 void
1371 1375 setfrontdq(kthread_t *tp)
1372 1376 {
1373 1377 disp_t *dp;
1374 1378 dispq_t *dq;
1375 1379 cpu_t *cp;
1376 1380 pri_t tpri;
1377 1381 int bound;
1378 1382
1379 1383 ASSERT(THREAD_LOCK_HELD(tp));
1380 1384 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 1385 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1382 1386
1383 1387 /*
1384 1388 * If thread is "swapped" or on the swap queue don't
1385 1389 * queue it, but wake sched.
1386 1390 */
1387 1391 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 1392 disp_swapped_setrun(tp);
1389 1393 return;
1390 1394 }
1391 1395
1392 1396 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 1397 bound = 1;
1394 1398 else
1395 1399 bound = 0;
1396 1400
1397 1401 tpri = DISP_PRIO(tp);
1398 1402 if (ncpus == 1)
1399 1403 cp = tp->t_cpu;
↓ open down ↓ |
116 lines elided |
↑ open up ↑ |
1400 1404 else if (!bound) {
1401 1405 if (tpri >= kpqpri) {
1402 1406 setkpdq(tp, SETKP_FRONT);
1403 1407 return;
1404 1408 }
1405 1409 cp = tp->t_cpu;
1406 1410 if (tp->t_cpupart == cp->cpu_part) {
1407 1411 /*
1408 1412 * We'll generally let this thread continue to run
1409 1413 * where it last ran, but will consider migration if:
1410 - * - The thread last ran outside it's home lgroup.
1414 + * - The thread last ran outside its home lgroup.
1411 1415 * - The CPU where it last ran is the target of an
1412 1416 * offline request (a thread_nomigrate() on the in
1413 1417 * motion CPU relies on this when forcing a preempt).
1414 1418 * - The thread isn't the highest priority thread where
1415 1419 * it last ran, and it is considered not likely to
1416 1420 * have significant cache warmth.
1417 1421 */
1418 - if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 - (cp == cpu_inmotion)) {
1420 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 - (tp == curthread) ? cp : NULL);
1422 - } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 - (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 - NULL);
1422 + if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
1423 + cp == cpu_inmotion ||
1424 + (tpri < cp->cpu_disp->disp_maxrunpri &&
1425 + !THREAD_HAS_CACHE_WARMTH(tp))) {
1426 + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1426 1427 }
1427 1428 } else {
1428 1429 /*
1429 1430 * Migrate to a cpu in the new partition.
1430 1431 */
1431 1432 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 - tp->t_lpl, tp->t_pri, NULL);
1433 + tp, tp->t_pri);
1433 1434 }
1434 1435 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1436 } else {
1436 1437 /*
1437 1438 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1439 * a short time until weak binding that existed when the
1439 1440 * strong binding was established has dropped) so we must
1440 1441 * favour weak binding over strong.
1441 1442 */
1442 1443 cp = tp->t_weakbound_cpu ?
1443 1444 tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 1445 }
1445 1446
1446 1447 /*
1447 1448 * A thread that is ONPROC may be temporarily placed on the run queue
1448 1449 * but then chosen to run again by disp. If the thread we're placing on
1449 1450 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 1451 * replacement process is actually scheduled in swtch(). In this
1451 1452 * situation, curthread is the only thread that could be in the ONPROC
1452 1453 * state.
1453 1454 */
1454 1455 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 1456 hrtime_t curtime;
1456 1457
1457 1458 curtime = gethrtime_unscaled();
1458 1459 (void) cpu_update_pct(tp, curtime);
1459 1460 tp->t_waitrq = curtime;
1460 1461 } else {
1461 1462 (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 1463 }
1463 1464
1464 1465 dp = cp->cpu_disp;
1465 1466 disp_lock_enter_high(&dp->disp_lock);
1466 1467
1467 1468 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 1469 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 1470
1470 1471 #ifndef NPROBE
1471 1472 /* Kernel probe */
1472 1473 if (tnf_tracing_active)
1473 1474 tnf_thread_queue(tp, cp, tpri);
1474 1475 #endif /* NPROBE */
1475 1476
1476 1477 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 1478
1478 1479 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1479 1480 tp->t_disp_queue = dp;
1480 1481
1481 1482 dq = &dp->disp_q[tpri];
1482 1483 dp->disp_nrunnable++;
1483 1484 if (!bound)
1484 1485 dp->disp_steal = 0;
1485 1486 membar_enter();
1486 1487
1487 1488 if (dq->dq_sruncnt++ != 0) {
1488 1489 ASSERT(dq->dq_last != NULL);
1489 1490 tp->t_link = dq->dq_first;
1490 1491 dq->dq_first = tp;
1491 1492 } else {
1492 1493 ASSERT(dq->dq_last == NULL);
1493 1494 ASSERT(dq->dq_first == NULL);
1494 1495 tp->t_link = NULL;
1495 1496 dq->dq_first = dq->dq_last = tp;
1496 1497 BT_SET(dp->disp_qactmap, tpri);
1497 1498 if (tpri > dp->disp_maxrunpri) {
1498 1499 dp->disp_maxrunpri = tpri;
1499 1500 membar_enter();
1500 1501 cpu_resched(cp, tpri);
1501 1502 }
1502 1503 }
1503 1504
1504 1505 if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 1506 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 1507 cp == CPU) {
1507 1508 /*
1508 1509 * If there are no other unbound threads on the
1509 1510 * run queue, don't allow other CPUs to steal
1510 1511 * this thread while we are in the middle of a
1511 1512 * context switch. We may just switch to it
1512 1513 * again right away. CPU_DISP_DONTSTEAL is cleared
1513 1514 * in swtch and swtch_to.
1514 1515 */
1515 1516 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 1517 }
1517 1518 dp->disp_max_unbound_pri = tpri;
1518 1519 }
1519 1520 (*disp_enq_thread)(cp, bound);
1520 1521 }
1521 1522
1522 1523 /*
1523 1524 * Put a high-priority unbound thread on the kp queue
1524 1525 */
1525 1526 static void
1526 1527 setkpdq(kthread_t *tp, int borf)
1527 1528 {
1528 1529 dispq_t *dq;
1529 1530 disp_t *dp;
1530 1531 cpu_t *cp;
1531 1532 pri_t tpri;
1532 1533
1533 1534 tpri = DISP_PRIO(tp);
1534 1535
1535 1536 dp = &tp->t_cpupart->cp_kp_queue;
1536 1537 disp_lock_enter_high(&dp->disp_lock);
1537 1538
1538 1539 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1540
1540 1541 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1542 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1543 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 1544 tp->t_disp_queue = dp;
1544 1545 dp->disp_nrunnable++;
1545 1546 dq = &dp->disp_q[tpri];
1546 1547
1547 1548 if (dq->dq_sruncnt++ != 0) {
1548 1549 if (borf == SETKP_BACK) {
1549 1550 ASSERT(dq->dq_first != NULL);
1550 1551 tp->t_link = NULL;
1551 1552 dq->dq_last->t_link = tp;
1552 1553 dq->dq_last = tp;
1553 1554 } else {
1554 1555 ASSERT(dq->dq_last != NULL);
1555 1556 tp->t_link = dq->dq_first;
1556 1557 dq->dq_first = tp;
1557 1558 }
1558 1559 } else {
1559 1560 if (borf == SETKP_BACK) {
1560 1561 ASSERT(dq->dq_first == NULL);
1561 1562 ASSERT(dq->dq_last == NULL);
1562 1563 dq->dq_first = dq->dq_last = tp;
1563 1564 } else {
1564 1565 ASSERT(dq->dq_last == NULL);
1565 1566 ASSERT(dq->dq_first == NULL);
1566 1567 tp->t_link = NULL;
1567 1568 dq->dq_first = dq->dq_last = tp;
1568 1569 }
1569 1570 BT_SET(dp->disp_qactmap, tpri);
1570 1571 if (tpri > dp->disp_max_unbound_pri)
1571 1572 dp->disp_max_unbound_pri = tpri;
1572 1573 if (tpri > dp->disp_maxrunpri) {
↓ open down ↓ |
130 lines elided |
↑ open up ↑ |
1573 1574 dp->disp_maxrunpri = tpri;
1574 1575 membar_enter();
1575 1576 }
1576 1577 }
1577 1578
1578 1579 cp = tp->t_cpu;
1579 1580 if (tp->t_cpupart != cp->cpu_part) {
1580 1581 /* migrate to a cpu in the new partition */
1581 1582 cp = tp->t_cpupart->cp_cpulist;
1582 1583 }
1583 - cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 + cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1584 1585 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1586 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1587
1587 1588 #ifndef NPROBE
1588 1589 /* Kernel probe */
1589 1590 if (tnf_tracing_active)
1590 1591 tnf_thread_queue(tp, cp, tpri);
1591 1592 #endif /* NPROBE */
1592 1593
1593 1594 if (cp->cpu_chosen_level < tpri)
1594 1595 cp->cpu_chosen_level = tpri;
1595 1596 cpu_resched(cp, tpri);
1596 1597 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1598 (*disp_enq_thread)(cp, 0);
1598 1599 }
1599 1600
1600 1601 /*
1601 1602 * Remove a thread from the dispatcher queue if it is on it.
1602 1603 * It is not an error if it is not found but we return whether
1603 1604 * or not it was found in case the caller wants to check.
1604 1605 */
1605 1606 int
1606 1607 dispdeq(kthread_t *tp)
1607 1608 {
1608 1609 disp_t *dp;
1609 1610 dispq_t *dq;
1610 1611 kthread_t *rp;
1611 1612 kthread_t *trp;
1612 1613 kthread_t **ptp;
1613 1614 int tpri;
1614 1615
1615 1616 ASSERT(THREAD_LOCK_HELD(tp));
1616 1617
1617 1618 if (tp->t_state != TS_RUN)
1618 1619 return (0);
1619 1620
1620 1621 /*
1621 1622 * The thread is "swapped" or is on the swap queue and
1622 1623 * hence no longer on the run queue, so return true.
1623 1624 */
1624 1625 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 1626 return (1);
1626 1627
1627 1628 tpri = DISP_PRIO(tp);
1628 1629 dp = tp->t_disp_queue;
1629 1630 ASSERT(tpri < dp->disp_npri);
1630 1631 dq = &dp->disp_q[tpri];
1631 1632 ptp = &dq->dq_first;
1632 1633 rp = *ptp;
1633 1634 trp = NULL;
1634 1635
1635 1636 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1637
1637 1638 /*
1638 1639 * Search for thread in queue.
1639 1640 * Double links would simplify this at the expense of disp/setrun.
1640 1641 */
1641 1642 while (rp != tp && rp != NULL) {
1642 1643 trp = rp;
1643 1644 ptp = &trp->t_link;
1644 1645 rp = trp->t_link;
1645 1646 }
1646 1647
1647 1648 if (rp == NULL) {
1648 1649 panic("dispdeq: thread not on queue");
1649 1650 }
1650 1651
1651 1652 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1653
1653 1654 /*
1654 1655 * Found it so remove it from queue.
1655 1656 */
1656 1657 if ((*ptp = rp->t_link) == NULL)
1657 1658 dq->dq_last = trp;
1658 1659
1659 1660 dp->disp_nrunnable--;
1660 1661 if (--dq->dq_sruncnt == 0) {
1661 1662 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1663 if (dp->disp_nrunnable == 0) {
1663 1664 dp->disp_max_unbound_pri = -1;
1664 1665 dp->disp_maxrunpri = -1;
1665 1666 } else if (tpri == dp->disp_maxrunpri) {
1666 1667 int ipri;
1667 1668
1668 1669 ipri = bt_gethighbit(dp->disp_qactmap,
1669 1670 dp->disp_maxrunpri >> BT_ULSHIFT);
1670 1671 if (ipri < dp->disp_max_unbound_pri)
1671 1672 dp->disp_max_unbound_pri = ipri;
1672 1673 dp->disp_maxrunpri = ipri;
1673 1674 }
1674 1675 }
1675 1676 tp->t_link = NULL;
1676 1677 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 1678 return (1);
1678 1679 }
1679 1680
1680 1681
1681 1682 /*
1682 1683 * dq_sruninc and dq_srundec are public functions for
1683 1684 * incrementing/decrementing the sruncnts when a thread on
1684 1685 * a dispatcher queue is made schedulable/unschedulable by
1685 1686 * resetting the TS_LOAD flag.
1686 1687 *
1687 1688 * The caller MUST have the thread lock and therefore the dispatcher
1688 1689 * queue lock so that the operation which changes
1689 1690 * the flag, the operation that checks the status of the thread to
1690 1691 * determine if it's on a disp queue AND the call to this function
1691 1692 * are one atomic operation with respect to interrupts.
1692 1693 */
1693 1694
1694 1695 /*
1695 1696 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 1697 */
1697 1698 void
1698 1699 dq_sruninc(kthread_t *t)
1699 1700 {
1700 1701 ASSERT(t->t_state == TS_RUN);
1701 1702 ASSERT(t->t_schedflag & TS_LOAD);
1702 1703
1703 1704 THREAD_TRANSITION(t);
1704 1705 setfrontdq(t);
1705 1706 }
1706 1707
1707 1708 /*
1708 1709 * See comment on calling conventions above.
1709 1710 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 1711 */
1711 1712 void
1712 1713 dq_srundec(kthread_t *t)
1713 1714 {
1714 1715 ASSERT(t->t_schedflag & TS_LOAD);
1715 1716
1716 1717 (void) dispdeq(t);
1717 1718 disp_swapped_enq(t);
1718 1719 }
1719 1720
1720 1721 /*
1721 1722 * Change the dispatcher lock of thread to the "swapped_lock"
1722 1723 * and return with thread lock still held.
1723 1724 *
1724 1725 * Called with thread_lock held, in transition state, and at high spl.
1725 1726 */
1726 1727 void
1727 1728 disp_swapped_enq(kthread_t *tp)
1728 1729 {
1729 1730 ASSERT(THREAD_LOCK_HELD(tp));
1730 1731 ASSERT(tp->t_schedflag & TS_LOAD);
1731 1732
1732 1733 switch (tp->t_state) {
1733 1734 case TS_RUN:
1734 1735 disp_lock_enter_high(&swapped_lock);
1735 1736 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 1737 break;
1737 1738 case TS_ONPROC:
1738 1739 disp_lock_enter_high(&swapped_lock);
1739 1740 THREAD_TRANSITION(tp);
1740 1741 wake_sched_sec = 1; /* tell clock to wake sched */
1741 1742 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 1743 break;
1743 1744 default:
1744 1745 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 1746 }
1746 1747 }
1747 1748
1748 1749 /*
1749 1750 * This routine is called by setbackdq/setfrontdq if the thread is
1750 1751 * not loaded or loaded and on the swap queue.
1751 1752 *
1752 1753 * Thread state TS_SLEEP implies that a swapped thread
1753 1754 * has been woken up and needs to be swapped in by the swapper.
1754 1755 *
1755 1756 * Thread state TS_RUN implies that the priority of a swapped
1756 1757 * thread is being increased by its scheduling class (e.g. ts_update).
1757 1758 */
1758 1759 static void
1759 1760 disp_swapped_setrun(kthread_t *tp)
1760 1761 {
1761 1762 ASSERT(THREAD_LOCK_HELD(tp));
1762 1763 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 1764
1764 1765 switch (tp->t_state) {
1765 1766 case TS_SLEEP:
1766 1767 disp_lock_enter_high(&swapped_lock);
1767 1768 /*
1768 1769 * Wakeup sched immediately (i.e., next tick) if the
1769 1770 * thread priority is above maxclsyspri.
1770 1771 */
1771 1772 if (DISP_PRIO(tp) > maxclsyspri)
1772 1773 wake_sched = 1;
1773 1774 else
1774 1775 wake_sched_sec = 1;
1775 1776 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 1777 break;
1777 1778 case TS_RUN: /* called from ts_update */
1778 1779 break;
1779 1780 default:
1780 1781 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 1782 }
1782 1783 }
1783 1784
1784 1785 /*
1785 1786 * Make a thread give up its processor. Find the processor on
1786 1787 * which this thread is executing, and have that processor
1787 1788 * preempt.
1788 1789 *
1789 1790 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1791 * they are running at kernel priorities. To implement this, we always
1791 1792 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 1793 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 1794 * competing with us.
1794 1795 */
1795 1796 void
1796 1797 cpu_surrender(kthread_t *tp)
1797 1798 {
1798 1799 cpu_t *cpup;
1799 1800 int max_pri;
1800 1801 int max_run_pri;
1801 1802 klwp_t *lwp;
1802 1803
1803 1804 ASSERT(THREAD_LOCK_HELD(tp));
1804 1805
1805 1806 if (tp->t_state != TS_ONPROC)
1806 1807 return;
1807 1808 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 1809 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1810 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1811 if (max_pri < max_run_pri)
1811 1812 max_pri = max_run_pri;
1812 1813
1813 1814 if (tp->t_cid == sysdccid) {
1814 1815 uint_t t_pri = DISP_PRIO(tp);
1815 1816 if (t_pri > max_pri)
1816 1817 return; /* we are not competing w/ anyone */
1817 1818 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1819 } else {
1819 1820 cpup->cpu_runrun = 1;
1820 1821 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1822 cpup->cpu_kprunrun = 1;
1822 1823 }
1823 1824 }
1824 1825
1825 1826 /*
1826 1827 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1827 1828 */
1828 1829 membar_enter();
1829 1830
1830 1831 DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1832
1832 1833 /*
1833 1834 * Make the target thread take an excursion through trap()
1834 1835 * to do preempt() (unless we're already in trap or post_syscall,
1835 1836 * calling cpu_surrender via CL_TRAPRET).
1836 1837 */
1837 1838 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1839 lwp->lwp_state != LWP_USER) {
1839 1840 aston(tp);
1840 1841 if (cpup != CPU)
1841 1842 poke_cpu(cpup->cpu_id);
1842 1843 }
1843 1844 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1845 "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1846 }
1846 1847
1847 1848 /*
1848 1849 * Commit to and ratify a scheduling decision
1849 1850 */
1850 1851 /*ARGSUSED*/
1851 1852 static kthread_t *
1852 1853 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1854 {
1854 1855 pri_t tpri, maxpri;
1855 1856 pri_t maxkpri;
1856 1857 cpu_t *cpup;
1857 1858
1858 1859 ASSERT(tp != NULL);
1859 1860 /*
1860 1861 * Commit to, then ratify scheduling decision
1861 1862 */
1862 1863 cpup = CPU;
1863 1864 if (cpup->cpu_runrun != 0)
1864 1865 cpup->cpu_runrun = 0;
1865 1866 if (cpup->cpu_kprunrun != 0)
1866 1867 cpup->cpu_kprunrun = 0;
1867 1868 if (cpup->cpu_chosen_level != -1)
1868 1869 cpup->cpu_chosen_level = -1;
1869 1870 membar_enter();
1870 1871 tpri = DISP_PRIO(tp);
1871 1872 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1873 maxkpri = kpq->disp_maxrunpri;
1873 1874 if (maxpri < maxkpri)
1874 1875 maxpri = maxkpri;
1875 1876 if (tpri < maxpri) {
1876 1877 /*
1877 1878 * should have done better
1878 1879 * put this one back and indicate to try again
1879 1880 */
1880 1881 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 1882 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1883 thread_lock_high(tp);
1883 1884 THREAD_TRANSITION(tp);
1884 1885 setfrontdq(tp);
1885 1886 thread_unlock_nopreempt(tp);
1886 1887
1887 1888 tp = NULL;
1888 1889 }
1889 1890 return (tp);
1890 1891 }
1891 1892
1892 1893 /*
1893 1894 * See if there is any work on the dispatcher queue for other CPUs.
1894 1895 * If there is, dequeue the best thread and return.
1895 1896 */
1896 1897 static kthread_t *
1897 1898 disp_getwork(cpu_t *cp)
1898 1899 {
1899 1900 cpu_t *ocp; /* other CPU */
1900 1901 cpu_t *ocp_start;
1901 1902 cpu_t *tcp; /* target local CPU */
1902 1903 kthread_t *tp;
1903 1904 kthread_t *retval = NULL;
1904 1905 pri_t maxpri;
1905 1906 disp_t *kpq; /* kp queue for this partition */
1906 1907 lpl_t *lpl, *lpl_leaf;
1907 1908 int leafidx, startidx;
1908 1909 hrtime_t stealtime;
1909 1910 lgrp_id_t local_id;
1910 1911
1911 1912 maxpri = -1;
1912 1913 tcp = NULL;
1913 1914
1914 1915 kpq = &cp->cpu_part->cp_kp_queue;
1915 1916 while (kpq->disp_maxrunpri >= 0) {
1916 1917 /*
1917 1918 * Try to take a thread from the kp_queue.
1918 1919 */
1919 1920 tp = (disp_getbest(kpq));
1920 1921 if (tp)
1921 1922 return (disp_ratify(tp, kpq));
1922 1923 }
1923 1924
1924 1925 kpreempt_disable(); /* protect the cpu_active list */
1925 1926
1926 1927 /*
1927 1928 * Try to find something to do on another CPU's run queue.
1928 1929 * Loop through all other CPUs looking for the one with the highest
1929 1930 * priority unbound thread.
1930 1931 *
1931 1932 * On NUMA machines, the partition's CPUs are consulted in order of
1932 1933 * distance from the current CPU. This way, the first available
1933 1934 * work found is also the closest, and will suffer the least
1934 1935 * from being migrated.
1935 1936 */
1936 1937 lpl = lpl_leaf = cp->cpu_lpl;
1937 1938 local_id = lpl_leaf->lpl_lgrpid;
1938 1939 leafidx = startidx = 0;
1939 1940
1940 1941 /*
1941 1942 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1943 * broader levels of locality
1943 1944 */
1944 1945 do {
1945 1946 /* This loop iterates over the lpl's leaves */
1946 1947 do {
1947 1948 if (lpl_leaf != cp->cpu_lpl)
1948 1949 ocp = lpl_leaf->lpl_cpus;
1949 1950 else
1950 1951 ocp = cp->cpu_next_lpl;
1951 1952
1952 1953 /* This loop iterates over the CPUs in the leaf */
1953 1954 ocp_start = ocp;
1954 1955 do {
1955 1956 pri_t pri;
1956 1957
1957 1958 ASSERT(CPU_ACTIVE(ocp));
1958 1959
1959 1960 /*
1960 1961 * End our stroll around this lpl if:
1961 1962 *
1962 1963 * - Something became runnable on the local
1963 1964 * queue...which also ends our stroll around
1964 1965 * the partition.
1965 1966 *
1966 1967 * - We happen across another idle CPU.
1967 1968 * Since it is patrolling the next portion
1968 1969 * of the lpl's list (assuming it's not
1969 1970 * halted, or busy servicing an interrupt),
1970 1971 * move to the next higher level of locality.
1971 1972 */
1972 1973 if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1974 kpreempt_enable();
1974 1975 return (NULL);
1975 1976 }
1976 1977 if (ocp->cpu_dispatch_pri == -1) {
1977 1978 if (ocp->cpu_disp_flags &
1978 1979 CPU_DISP_HALTED ||
1979 1980 ocp->cpu_intr_actv != 0)
1980 1981 continue;
1981 1982 else
1982 1983 goto next_level;
1983 1984 }
1984 1985
1985 1986 /*
1986 1987 * If there's only one thread and the CPU
1987 1988 * is in the middle of a context switch,
1988 1989 * or it's currently running the idle thread,
1989 1990 * don't steal it.
1990 1991 */
1991 1992 if ((ocp->cpu_disp_flags &
1992 1993 CPU_DISP_DONTSTEAL) &&
1993 1994 ocp->cpu_disp->disp_nrunnable == 1)
1994 1995 continue;
1995 1996
1996 1997 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1998 if (pri > maxpri) {
1998 1999 /*
1999 2000 * Don't steal threads that we attempted
2000 2001 * to steal recently until they're ready
2001 2002 * to be stolen again.
2002 2003 */
2003 2004 stealtime = ocp->cpu_disp->disp_steal;
2004 2005 if (stealtime == 0 ||
2005 2006 stealtime - gethrtime() <= 0) {
2006 2007 maxpri = pri;
2007 2008 tcp = ocp;
2008 2009 } else {
2009 2010 /*
2010 2011 * Don't update tcp, just set
2011 2012 * the retval to T_DONTSTEAL, so
2012 2013 * that if no acceptable CPUs
2013 2014 * are found the return value
2014 2015 * will be T_DONTSTEAL rather
2015 2016 * than NULL.
2016 2017 */
2017 2018 retval = T_DONTSTEAL;
2018 2019 }
2019 2020 }
2020 2021 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 2022
2022 2023 /*
2023 2024 * Iterate to the next leaf lpl in the resource set
2024 2025 * at this level of locality. If we hit the end of
2025 2026 * the set, wrap back around to the beginning.
2026 2027 *
2027 2028 * Note: This iteration is NULL terminated for a reason
2028 2029 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 2030 */
2030 2031 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 2032 leafidx = 0;
2032 2033 lpl_leaf = lpl->lpl_rset[leafidx];
2033 2034 }
2034 2035 } while (leafidx != startidx);
2035 2036
2036 2037 next_level:
2037 2038 /*
2038 2039 * Expand the search to include farther away CPUs (next
2039 2040 * locality level). The closer CPUs that have already been
2040 2041 * checked will be checked again. In doing so, idle CPUs
2041 2042 * will tend to be more aggressive about stealing from CPUs
2042 2043 * that are closer (since the closer CPUs will be considered
2043 2044 * more often).
2044 2045 * Begin at this level with the CPUs local leaf lpl.
2045 2046 */
2046 2047 if ((lpl = lpl->lpl_parent) != NULL) {
2047 2048 leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 2049 lpl_leaf = lpl->lpl_rset[leafidx];
2049 2050 }
2050 2051 } while (!tcp && lpl);
2051 2052
2052 2053 kpreempt_enable();
2053 2054
2054 2055 /*
2055 2056 * If another queue looks good, and there is still nothing on
2056 2057 * the local queue, try to transfer one or more threads
2057 2058 * from it to our queue.
2058 2059 */
2059 2060 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 2061 tp = disp_getbest(tcp->cpu_disp);
2061 2062 if (tp == NULL || tp == T_DONTSTEAL)
2062 2063 return (tp);
2063 2064 return (disp_ratify(tp, kpq));
2064 2065 }
2065 2066 return (retval);
2066 2067 }
2067 2068
2068 2069
2069 2070 /*
2070 2071 * disp_fix_unbound_pri()
2071 2072 * Determines the maximum priority of unbound threads on the queue.
2072 2073 * The priority is kept for the queue, but is only increased, never
2073 2074 * reduced unless some CPU is looking for something on that queue.
2074 2075 *
2075 2076 * The priority argument is the known upper limit.
2076 2077 *
2077 2078 * Perhaps this should be kept accurately, but that probably means
2078 2079 * separate bitmaps for bound and unbound threads. Since only idled
2079 2080 * CPUs will have to do this recalculation, it seems better this way.
2080 2081 */
2081 2082 static void
2082 2083 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 2084 {
2084 2085 kthread_t *tp;
2085 2086 dispq_t *dq;
2086 2087 ulong_t *dqactmap = dp->disp_qactmap;
2087 2088 ulong_t mapword;
2088 2089 int wx;
2089 2090
2090 2091 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 2092
2092 2093 ASSERT(pri >= 0); /* checked by caller */
2093 2094
2094 2095 /*
2095 2096 * Start the search at the next lowest priority below the supplied
2096 2097 * priority. This depends on the bitmap implementation.
2097 2098 */
2098 2099 do {
2099 2100 wx = pri >> BT_ULSHIFT; /* index of word in map */
2100 2101
2101 2102 /*
2102 2103 * Form mask for all lower priorities in the word.
2103 2104 */
2104 2105 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 2106
2106 2107 /*
2107 2108 * Get next lower active priority.
2108 2109 */
2109 2110 if (mapword != 0) {
2110 2111 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 2112 } else if (wx > 0) {
2112 2113 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 2114 if (pri < 0)
2114 2115 break;
2115 2116 } else {
2116 2117 pri = -1;
2117 2118 break;
2118 2119 }
2119 2120
2120 2121 /*
2121 2122 * Search the queue for unbound, runnable threads.
2122 2123 */
2123 2124 dq = &dp->disp_q[pri];
2124 2125 tp = dq->dq_first;
2125 2126
2126 2127 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 2128 tp = tp->t_link;
2128 2129 }
2129 2130
2130 2131 /*
2131 2132 * If a thread was found, set the priority and return.
2132 2133 */
2133 2134 } while (tp == NULL);
2134 2135
2135 2136 /*
2136 2137 * pri holds the maximum unbound thread priority or -1.
2137 2138 */
2138 2139 if (dp->disp_max_unbound_pri != pri)
2139 2140 dp->disp_max_unbound_pri = pri;
2140 2141 }
2141 2142
2142 2143 /*
2143 2144 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 2145 * check if the CPU to which it was previously bound should have
2145 2146 * its disp_max_unbound_pri increased.
2146 2147 */
2147 2148 void
2148 2149 disp_adjust_unbound_pri(kthread_t *tp)
2149 2150 {
2150 2151 disp_t *dp;
2151 2152 pri_t tpri;
2152 2153
2153 2154 ASSERT(THREAD_LOCK_HELD(tp));
2154 2155
2155 2156 /*
2156 2157 * Don't do anything if the thread is not bound, or
2157 2158 * currently not runnable or swapped out.
2158 2159 */
2159 2160 if (tp->t_bound_cpu == NULL ||
2160 2161 tp->t_state != TS_RUN ||
2161 2162 tp->t_schedflag & TS_ON_SWAPQ)
2162 2163 return;
2163 2164
2164 2165 tpri = DISP_PRIO(tp);
2165 2166 dp = tp->t_bound_cpu->cpu_disp;
2166 2167 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2168 if (tpri > dp->disp_max_unbound_pri)
2168 2169 dp->disp_max_unbound_pri = tpri;
2169 2170 }
2170 2171
2171 2172 /*
2172 2173 * disp_getbest()
2173 2174 * De-queue the highest priority unbound runnable thread.
2174 2175 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2176 * Returns NULL if nothing found.
2176 2177 * Returns T_DONTSTEAL if the thread was not stealable,
2177 2178 * so that the caller will try again later.
2178 2179 *
2179 2180 * Passed a pointer to a dispatch queue not associated with this CPU, and
2180 2181 * its type.
2181 2182 */
2182 2183 static kthread_t *
2183 2184 disp_getbest(disp_t *dp)
2184 2185 {
2185 2186 kthread_t *tp;
2186 2187 dispq_t *dq;
2187 2188 pri_t pri;
2188 2189 cpu_t *cp, *tcp;
2189 2190 boolean_t allbound;
2190 2191
2191 2192 disp_lock_enter(&dp->disp_lock);
2192 2193
2193 2194 /*
2194 2195 * If there is nothing to run, or the CPU is in the middle of a
2195 2196 * context switch of the only thread, return NULL.
2196 2197 */
2197 2198 tcp = dp->disp_cpu;
2198 2199 cp = CPU;
2199 2200 pri = dp->disp_max_unbound_pri;
2200 2201 if (pri == -1 ||
2201 2202 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2203 tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2204 disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2205 return (NULL);
2205 2206 }
2206 2207
2207 2208 dq = &dp->disp_q[pri];
2208 2209
2209 2210
2210 2211 /*
2211 2212 * Assume that all threads are bound on this queue, and change it
2212 2213 * later when we find out that it is not the case.
2213 2214 */
2214 2215 allbound = B_TRUE;
2215 2216 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2217 hrtime_t now, nosteal, rqtime;
2217 2218
2218 2219 /*
2219 2220 * Skip over bound threads which could be here even
2220 2221 * though disp_max_unbound_pri indicated this level.
2221 2222 */
2222 2223 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2224 continue;
2224 2225
2225 2226 /*
2226 2227 * We've got some unbound threads on this queue, so turn
2227 2228 * the allbound flag off now.
2228 2229 */
2229 2230 allbound = B_FALSE;
2230 2231
2231 2232 /*
2232 2233 * The thread is a candidate for stealing from its run queue. We
2233 2234 * don't want to steal threads that became runnable just a
2234 2235 * moment ago. This improves CPU affinity for threads that get
2235 2236 * preempted for short periods of time and go back on the run
2236 2237 * queue.
2237 2238 *
2238 2239 * We want to let it stay on its run queue if it was only placed
2239 2240 * there recently and it was running on the same CPU before that
2240 2241 * to preserve its cache investment. For the thread to remain on
2241 2242 * its run queue, ALL of the following conditions must be
2242 2243 * satisfied:
2243 2244 *
2244 2245 * - the disp queue should not be the kernel preemption queue
2245 2246 * - delayed idle stealing should not be disabled
2246 2247 * - nosteal_nsec should be non-zero
2247 2248 * - it should run with user priority
2248 2249 * - it should be on the run queue of the CPU where it was
2249 2250 * running before being placed on the run queue
2250 2251 * - it should be the only thread on the run queue (to prevent
2251 2252 * extra scheduling latency for other threads)
2252 2253 * - it should sit on the run queue for less than per-chip
2253 2254 * nosteal interval or global nosteal interval
2254 2255 * - in case of CPUs with shared cache it should sit in a run
2255 2256 * queue of a CPU from a different chip
2256 2257 *
2257 2258 * The checks are arranged so that the ones that are faster are
2258 2259 * placed earlier.
2259 2260 */
2260 2261 if (tcp == NULL ||
2261 2262 pri >= minclsyspri ||
2262 2263 tp->t_cpu != tcp)
2263 2264 break;
2264 2265
2265 2266 /*
2266 2267 * Steal immediately if, due to CMT processor architecture,
2267 2268 * migration between cp and tcp would incur no performance
2268 2269 * penalty.
2269 2270 */
2270 2271 if (pg_cmt_can_migrate(cp, tcp))
2271 2272 break;
2272 2273
2273 2274 nosteal = nosteal_nsec;
2274 2275 if (nosteal == 0)
2275 2276 break;
2276 2277
2277 2278 /*
2278 2279 * Calculate time spent sitting on run queue
2279 2280 */
2280 2281 now = gethrtime_unscaled();
2281 2282 rqtime = now - tp->t_waitrq;
2282 2283 scalehrtime(&rqtime);
2283 2284
2284 2285 /*
2285 2286 * Steal immediately if the time spent on this run queue is more
2286 2287 * than allowed nosteal delay.
2287 2288 *
2288 2289 * Negative rqtime check is needed here to avoid infinite
2289 2290 * stealing delays caused by unlikely but not impossible
2290 2291 * drifts between CPU times on different CPUs.
2291 2292 */
2292 2293 if (rqtime > nosteal || rqtime < 0)
2293 2294 break;
2294 2295
2295 2296 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2297 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2298 scalehrtime(&now);
2298 2299 /*
2299 2300 * Calculate when this thread becomes stealable
2300 2301 */
2301 2302 now += (nosteal - rqtime);
2302 2303
2303 2304 /*
2304 2305 * Calculate time when some thread becomes stealable
2305 2306 */
2306 2307 if (now < dp->disp_steal)
2307 2308 dp->disp_steal = now;
2308 2309 }
2309 2310
2310 2311 /*
2311 2312 * If there were no unbound threads on this queue, find the queue
2312 2313 * where they are and then return later. The value of
2313 2314 * disp_max_unbound_pri is not always accurate because it isn't
2314 2315 * reduced until another idle CPU looks for work.
2315 2316 */
2316 2317 if (allbound)
2317 2318 disp_fix_unbound_pri(dp, pri);
2318 2319
2319 2320 /*
2320 2321 * If we reached the end of the queue and found no unbound threads
2321 2322 * then return NULL so that other CPUs will be considered. If there
2322 2323 * are unbound threads but they cannot yet be stolen, then
2323 2324 * return T_DONTSTEAL and try again later.
2324 2325 */
2325 2326 if (tp == NULL) {
2326 2327 disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2328 return (allbound ? NULL : T_DONTSTEAL);
2328 2329 }
2329 2330
2330 2331 /*
2331 2332 * Found a runnable, unbound thread, so remove it from queue.
2332 2333 * dispdeq() requires that we have the thread locked, and we do,
2333 2334 * by virtue of holding the dispatch queue lock. dispdeq() will
2334 2335 * put the thread in transition state, thereby dropping the dispq
2335 2336 * lock.
2336 2337 */
2337 2338
2338 2339 #ifdef DEBUG
2339 2340 {
2340 2341 int thread_was_on_queue;
2341 2342
2342 2343 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2343 2344 ASSERT(thread_was_on_queue);
2344 2345 }
2345 2346
2346 2347 #else /* DEBUG */
2347 2348 (void) dispdeq(tp); /* drops disp_lock */
2348 2349 #endif /* DEBUG */
2349 2350
2350 2351 /*
2351 2352 * Reset the disp_queue steal time - we do not know what the smallest
2352 2353 * value across the queue is.
2353 2354 */
2354 2355 dp->disp_steal = 0;
2355 2356
2356 2357 tp->t_schedflag |= TS_DONT_SWAP;
2357 2358
2358 2359 /*
2359 2360 * Setup thread to run on the current CPU.
2360 2361 */
2361 2362 tp->t_disp_queue = cp->cpu_disp;
2362 2363
2363 2364 cp->cpu_dispthread = tp; /* protected by spl only */
2364 2365 cp->cpu_dispatch_pri = pri;
2365 2366
2366 2367 /*
2367 2368 * There can be a memory synchronization race between disp_getbest()
2368 2369 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2370 * to preempt the current thread to run the enqueued thread while
2370 2371 * disp_getbest() and disp_ratify() are changing the current thread
2371 2372 * to the stolen thread. This may lead to a situation where
2372 2373 * cpu_resched() tries to preempt the wrong thread and the
2373 2374 * stolen thread continues to run on the CPU which has been tagged
2374 2375 * for preemption.
2375 2376 * Later the clock thread gets enqueued but doesn't get to run on the
2376 2377 * CPU causing the system to hang.
2377 2378 *
2378 2379 * To avoid this, grabbing and dropping the disp_lock (which does
2379 2380 * a memory barrier) is needed to synchronize the execution of
2380 2381 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2382 * synchronize the memory read and written by cpu_resched(),
2382 2383 * disp_getbest(), and disp_ratify() with each other.
2383 2384 * (see CR#6482861 for more details).
2384 2385 */
2385 2386 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2387 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2388
2388 2389 ASSERT(pri == DISP_PRIO(tp));
2389 2390
2390 2391 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2392
2392 2393 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2393 2394
2394 2395 /*
2395 2396 * Return with spl high so that swtch() won't need to raise it.
2396 2397 * The disp_lock was dropped by dispdeq().
2397 2398 */
2398 2399
2399 2400 return (tp);
2400 2401 }
2401 2402
2402 2403 /*
2403 2404 * disp_bound_common() - common routine for higher level functions
2404 2405 * that check for bound threads under certain conditions.
2405 2406 * If 'threadlistsafe' is set then there is no need to acquire
2406 2407 * pidlock to stop the thread list from changing (eg, if
2407 2408 * disp_bound_* is called with cpus paused).
2408 2409 */
2409 2410 static int
2410 2411 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2412 {
2412 2413 int found = 0;
2413 2414 kthread_t *tp;
2414 2415
2415 2416 ASSERT(flag);
2416 2417
2417 2418 if (!threadlistsafe)
2418 2419 mutex_enter(&pidlock);
2419 2420 tp = curthread; /* faster than allthreads */
2420 2421 do {
2421 2422 if (tp->t_state != TS_FREE) {
2422 2423 /*
2423 2424 * If an interrupt thread is busy, but the
2424 2425 * caller doesn't care (i.e. BOUND_INTR is off),
2425 2426 * then just ignore it and continue through.
2426 2427 */
2427 2428 if ((tp->t_flag & T_INTR_THREAD) &&
2428 2429 !(flag & BOUND_INTR))
2429 2430 continue;
2430 2431
2431 2432 /*
2432 2433 * Skip the idle thread for the CPU
2433 2434 * we're about to set offline.
2434 2435 */
2435 2436 if (tp == cp->cpu_idle_thread)
2436 2437 continue;
2437 2438
2438 2439 /*
2439 2440 * Skip the pause thread for the CPU
2440 2441 * we're about to set offline.
2441 2442 */
2442 2443 if (tp == cp->cpu_pause_thread)
2443 2444 continue;
2444 2445
2445 2446 if ((flag & BOUND_CPU) &&
2446 2447 (tp->t_bound_cpu == cp ||
2447 2448 tp->t_bind_cpu == cp->cpu_id ||
2448 2449 tp->t_weakbound_cpu == cp)) {
2449 2450 found = 1;
2450 2451 break;
2451 2452 }
2452 2453
2453 2454 if ((flag & BOUND_PARTITION) &&
2454 2455 (tp->t_cpupart == cp->cpu_part)) {
2455 2456 found = 1;
2456 2457 break;
2457 2458 }
2458 2459 }
2459 2460 } while ((tp = tp->t_next) != curthread && found == 0);
2460 2461 if (!threadlistsafe)
2461 2462 mutex_exit(&pidlock);
2462 2463 return (found);
2463 2464 }
2464 2465
2465 2466 /*
2466 2467 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2468 * Called infrequently. Keep this simple.
2468 2469 * Includes threads that are asleep or stopped but not onproc.
2469 2470 */
2470 2471 int
2471 2472 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2473 {
2473 2474 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2475 }
2475 2476
2476 2477 /*
2477 2478 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2479 * to the given processor, including interrupt threads.
2479 2480 */
2480 2481 int
2481 2482 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2483 {
2483 2484 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2485 }
2485 2486
2486 2487 /*
2487 2488 * disp_bound_partition - return nonzero if threads are bound to the same
2488 2489 * partition as the processor.
2489 2490 * Called infrequently. Keep this simple.
2490 2491 * Includes threads that are asleep or stopped but not onproc.
2491 2492 */
2492 2493 int
2493 2494 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2495 {
2495 2496 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2497 }
2497 2498
2498 2499 /*
2499 2500 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2501 * threads to other CPUs.
2501 2502 */
2502 2503 void
2503 2504 disp_cpu_inactive(cpu_t *cp)
2504 2505 {
2505 2506 kthread_t *tp;
2506 2507 disp_t *dp = cp->cpu_disp;
2507 2508 dispq_t *dq;
2508 2509 pri_t pri;
2509 2510 int wasonq;
2510 2511
2511 2512 disp_lock_enter(&dp->disp_lock);
2512 2513 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2514 dq = &dp->disp_q[pri];
2514 2515 tp = dq->dq_first;
2515 2516
2516 2517 /*
2517 2518 * Skip over bound threads.
2518 2519 */
2519 2520 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2521 tp = tp->t_link;
2521 2522 }
2522 2523
2523 2524 if (tp == NULL) {
2524 2525 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2526 disp_fix_unbound_pri(dp, pri);
2526 2527 continue;
2527 2528 }
2528 2529
2529 2530 wasonq = dispdeq(tp); /* drops disp_lock */
2530 2531 ASSERT(wasonq);
2531 2532 ASSERT(tp->t_weakbound_cpu == NULL);
2532 2533
2533 2534 setbackdq(tp);
2534 2535 /*
2535 2536 * Called from cpu_offline:
2536 2537 *
2537 2538 * cp has already been removed from the list of active cpus
2538 2539 * and tp->t_cpu has been changed so there is no risk of
2539 2540 * tp ending up back on cp.
2540 2541 *
2541 2542 * Called from cpupart_move_cpu:
2542 2543 *
2543 2544 * The cpu has moved to a new cpupart. Any threads that
2544 2545 * were on its dispatch queues before the move remain
2545 2546 * in the old partition and can't run in the new partition.
↓ open down ↓ |
952 lines elided |
↑ open up ↑ |
2546 2547 */
2547 2548 ASSERT(tp->t_cpu != cp);
2548 2549 thread_unlock(tp);
2549 2550
2550 2551 disp_lock_enter(&dp->disp_lock);
2551 2552 }
2552 2553 disp_lock_exit(&dp->disp_lock);
2553 2554 }
2554 2555
2555 2556 /*
2556 - * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 - * The hint passed in is used as a starting point so we don't favor
2558 - * CPU 0 or any other CPU. The caller should pass in the most recently
2559 - * used CPU for the thread.
2557 + * Return a score rating this CPU for running this thread: lower is better.
2560 2558 *
2561 - * The lgroup and priority are used to determine the best CPU to run on
2562 - * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 - * the thread priority will indicate whether the thread will actually run
2564 - * there. To pick the best CPU, the CPUs inside and outside of the given
2565 - * lgroup which are running the lowest priority threads are found. The
2566 - * remote CPU is chosen only if the thread will not run locally on a CPU
2567 - * within the lgroup, but will run on the remote CPU. If the thread
2568 - * cannot immediately run on any CPU, the best local CPU will be chosen.
2559 + * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
2560 + * curcpu (as that's our own priority).
2569 2561 *
2570 - * The lpl specified also identifies the cpu partition from which
2571 - * disp_lowpri_cpu should select a CPU.
2562 + * If a cpu is the target of an offline request, then try to avoid it.
2572 2563 *
2573 - * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 - * behalf of the current thread. (curthread is looking for a new cpu)
2575 - * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 - * ignored.
2564 + * Otherwise we'll use double the effective dispatcher priority for the CPU.
2577 2565 *
2578 - * If a cpu is the target of an offline request then try to avoid it.
2566 + * We do this so ht_adjust_cpu_score() can increment the score if needed,
2567 + * without ending up over-riding a dispatcher priority.
2568 + */
2569 +static pri_t
2570 +cpu_score(cpu_t *cp, kthread_t *tp)
2571 +{
2572 + pri_t score;
2573 +
2574 + if (tp == curthread && cp == curthread->t_cpu)
2575 + score = 2 * CPU_IDLE_PRI;
2576 + else if (cp == cpu_inmotion)
2577 + score = SHRT_MAX;
2578 + else
2579 + score = 2 * cp->cpu_dispatch_pri;
2580 +
2581 + if (2 * cp->cpu_disp->disp_maxrunpri > score)
2582 + score = 2 * cp->cpu_disp->disp_maxrunpri;
2583 + if (2 * cp->cpu_chosen_level > score)
2584 + score = 2 * cp->cpu_chosen_level;
2585 +
2586 + return (ht_adjust_cpu_score(tp, cp, score));
2587 +}
2588 +
2589 +/*
2590 + * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2579 2591 *
2580 - * This function must be called at either high SPL, or with preemption
2581 - * disabled, so that the "hint" CPU cannot be removed from the online
2582 - * CPU list while we are traversing it.
2592 + * We are looking for a CPU with an effective dispatch priority lower than the
2593 + * thread's, so that the thread will run immediately rather than be enqueued.
2594 + * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
2595 + * If we don't find an available CPU there, we will expand our search to include
2596 + * wider locality levels. (Note these groups are already divided by CPU
2597 + * partition.)
2598 + *
2599 + * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
2600 + * the best home CPU we found.
2601 + *
2602 + * The hint passed in is used as a starting point so we don't favor CPU 0 or any
2603 + * other CPU. The caller should pass in the most recently used CPU for the
2604 + * thread; it's of course possible that this CPU isn't in the home lgroup.
2605 + *
2606 + * This function must be called at either high SPL, or with preemption disabled,
2607 + * so that the "hint" CPU cannot be removed from the online CPU list while we
2608 + * are traversing it.
2583 2609 */
2584 2610 cpu_t *
2585 -disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2611 +disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2586 2612 {
2587 2613 cpu_t *bestcpu;
2588 2614 cpu_t *besthomecpu;
2589 2615 cpu_t *cp, *cpstart;
2590 2616
2591 - pri_t bestpri;
2592 - pri_t cpupri;
2593 -
2594 2617 klgrpset_t done;
2595 - klgrpset_t cur_set;
2596 2618
2597 2619 lpl_t *lpl_iter, *lpl_leaf;
2598 - int i;
2599 2620
2600 - /*
2601 - * Scan for a CPU currently running the lowest priority thread.
2602 - * Cannot get cpu_lock here because it is adaptive.
2603 - * We do not require lock on CPU list.
2604 - */
2605 2621 ASSERT(hint != NULL);
2606 - ASSERT(lpl != NULL);
2607 - ASSERT(lpl->lpl_ncpu > 0);
2622 + ASSERT(tp->t_lpl->lpl_ncpu > 0);
2608 2623
2609 - /*
2610 - * First examine local CPUs. Note that it's possible the hint CPU
2611 - * passed in in remote to the specified home lgroup. If our priority
2612 - * isn't sufficient enough such that we can run immediately at home,
2613 - * then examine CPUs remote to our home lgroup.
2614 - * We would like to give preference to CPUs closest to "home".
2615 - * If we can't find a CPU where we'll run at a given level
2616 - * of locality, we expand our search to include the next level.
2617 - */
2618 2624 bestcpu = besthomecpu = NULL;
2619 2625 klgrpset_clear(done);
2620 - /* start with lpl we were passed */
2621 2626
2622 - lpl_iter = lpl;
2627 + lpl_iter = tp->t_lpl;
2623 2628
2624 2629 do {
2630 + pri_t best = SHRT_MAX;
2631 + klgrpset_t cur_set;
2625 2632
2626 - bestpri = SHRT_MAX;
2627 2633 klgrpset_clear(cur_set);
2628 2634
2629 - for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2635 + for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2636 lpl_leaf = lpl_iter->lpl_rset[i];
2631 2637 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2638 continue;
2633 2639
2634 2640 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2641
2636 2642 if (hint->cpu_lpl == lpl_leaf)
2637 2643 cp = cpstart = hint;
2638 2644 else
2639 2645 cp = cpstart = lpl_leaf->lpl_cpus;
2640 2646
2641 2647 do {
2642 - if (cp == curcpu)
2643 - cpupri = -1;
2644 - else if (cp == cpu_inmotion)
2645 - cpupri = SHRT_MAX;
2646 - else
2647 - cpupri = cp->cpu_dispatch_pri;
2648 - if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 - cpupri = cp->cpu_disp->disp_maxrunpri;
2650 - if (cp->cpu_chosen_level > cpupri)
2651 - cpupri = cp->cpu_chosen_level;
2652 - if (cpupri < bestpri) {
2653 - if (CPU_IDLING(cpupri)) {
2654 - ASSERT((cp->cpu_flags &
2655 - CPU_QUIESCED) == 0);
2656 - return (cp);
2657 - }
2648 + pri_t score = cpu_score(cp, tp);
2649 +
2650 + if (score < best) {
2651 + best = score;
2658 2652 bestcpu = cp;
2659 - bestpri = cpupri;
2653 +
2654 + /* An idle CPU: we're done. */
2655 + if (score / 2 == CPU_IDLE_PRI)
2656 + goto out;
2660 2657 }
2661 2658 } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2659 }
2663 2660
2664 - if (bestcpu && (tpri > bestpri)) {
2665 - ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 - return (bestcpu);
2667 - }
2661 + if (bestcpu != NULL && tpri > (best / 2))
2662 + goto out;
2663 +
2668 2664 if (besthomecpu == NULL)
2669 2665 besthomecpu = bestcpu;
2666 +
2670 2667 /*
2671 2668 * Add the lgrps we just considered to the "done" set
2672 2669 */
2673 2670 klgrpset_or(done, cur_set);
2674 2671
2675 2672 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2673
2677 2674 /*
2678 2675 * The specified priority isn't high enough to run immediately
2679 2676 * anywhere, so just return the best CPU from the home lgroup.
2680 2677 */
2681 - ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 - return (besthomecpu);
2678 + bestcpu = besthomecpu;
2679 +
2680 +out:
2681 + ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2682 + return (bestcpu);
2683 2683 }
2684 2684
2685 2685 /*
2686 2686 * This routine provides the generic idle cpu function for all processors.
2687 2687 * If a processor has some specific code to execute when idle (say, to stop
2688 2688 * the pipeline and save power) then that routine should be defined in the
2689 2689 * processors specific code (module_xx.c) and the global variable idle_cpu
2690 2690 * set to that function.
2691 2691 */
2692 2692 static void
2693 2693 generic_idle_cpu(void)
2694 2694 {
2695 2695 }
2696 2696
2697 2697 /*ARGSUSED*/
2698 2698 static void
2699 2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 2700 {
2701 +}
2702 +
2703 +cpu_t *
2704 +disp_choose_best_cpu(void)
2705 +{
2706 + kthread_t *t = curthread;
2707 + cpu_t *curcpu = CPU;
2708 +
2709 + ASSERT(t->t_preempt > 0);
2710 + ASSERT(t->t_state == TS_ONPROC);
2711 + ASSERT(t->t_schedflag & TS_VCPU);
2712 +
2713 + if (ht_should_run(t, curcpu))
2714 + return (curcpu);
2715 +
2716 + return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2701 2717 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX