/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

/*
 * The System Duty Cycle (SDC) scheduling class
 * --------------------------------------------
 *
 * Background
 *
 * Kernel threads in Solaris have traditionally not been large consumers
 * of CPU time. They typically wake up, perform a small amount of
 * work, then go back to sleep waiting for either a timeout or another
 * signal. On the assumption that the small amount of work that they do
 * is important for the behavior of the whole system, these threads are
 * treated kindly by the dispatcher and the SYS scheduling class: they run
 * without preemption from anything other than real-time and interrupt
 * threads; when preempted, they are put at the front of the queue, so they
 * generally do not migrate between CPUs; and they are allowed to stay
 * running until they voluntarily give up the CPU.
 *
 * As Solaris has evolved, new workloads have emerged which require the
 * kernel to perform significant amounts of CPU-intensive work. One
 * example of such a workload is ZFS's transaction group sync processing.
 * Each sync operation generates a large batch of I/Os, and each I/O
 * may need to be compressed and/or checksummed before it is written to
 * storage. The taskq threads which perform the compression and checksums
 * will run nonstop as long as they have work to do; a large sync operation
 * on a compression-heavy dataset can keep them busy for seconds on end.
 * This causes human-time-scale dispatch latency bubbles for any other
 * threads which have the misfortune to share a CPU with the taskq threads.
 *
 * The SDC scheduling class is a solution to this problem.
 *
 *
 * Overview
 *
 * SDC is centered around the concept of a thread's duty cycle (DC):
 *
 *			      ONPROC time
 *	Duty Cycle =	----------------------
 *			ONPROC + Runnable time
 *
 * This is the ratio of the time that the thread spent running on a CPU
 * divided by the time it spent running or trying to run. It is unaffected
 * by any time the thread spent sleeping, stopped, etc.
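 *
 * For example, a thread which accumulated 30ms of ONPROC time and 10ms
 * of Runnable time over a measurement interval has a duty cycle of
 * 30 / (30 + 10), i.e. 75%, no matter how long it slept during that
 * interval.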
 *
 * A thread joining the SDC class specifies a "target" DC that it wants
 * to run at. To implement this policy, the routine sysdc_update() scans
 * the list of active SDC threads every few ticks and uses each thread's
 * microstate data to compute the actual duty cycle that that thread
 * has experienced recently. If the thread is under its target DC, its
 * priority is increased to the maximum available (sysdc_maxpri, which is
 * 99 by default). If the thread is over its target DC, its priority is
 * reduced to the minimum available (sysdc_minpri, 0 by default). This
 * is a fairly primitive approach, in that it doesn't use any of the
 * intermediate priorities, but it's not completely inappropriate. Even
 * though threads in the SDC class might take a while to do their job, they
 * are by some definition important if they're running inside the kernel,
 * so it is reasonable that they should get to run at priority 99.
 *
 * If a thread is running when sysdc_update() calculates its actual duty
 * cycle, and there are other threads of equal or greater priority on its
 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
 * acknowledges the preemption by calling sysdc_preempt(), which calls
 * setbackdq(), which gives other threads with the same priority a chance
 * to run. This creates a de facto time quantum for threads in the SDC
 * scheduling class.
 *
 * An SDC thread which is assigned priority 0 can continue to run if
 * nothing else needs to use the CPU that it's running on. Similarly, an
 * SDC thread at priority 99 might not get to run as much as it wants to
 * if there are other priority-99 or higher threads on its CPU. These
 * situations would cause the thread to get ahead of or behind its target
 * DC; the longer the situations lasted, the further ahead or behind the
 * thread would get. Rather than condemning a thread to a lifetime of
 * paying for its youthful indiscretions, SDC keeps "base" values for
 * ONPROC and Runnable times in each thread's sysdc data, and updates these
 * values periodically. The duty cycle is then computed using the elapsed
 * amount of ONPROC and Runnable times since those base times.
 *
 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 * keep the list of "active" threads small by pruning out threads which
 * have been asleep for a brief time. They are not pruned immediately upon
 * going to sleep, since some threads may bounce back and forth between
 * sleeping and being runnable.
 *
 *
 * Interfaces
 *
 *	void sysdc_thread_enter(t, dc, flags)
 *
 *		Moves a kernel thread from the SYS scheduling class to the
 *		SDC class. t must have an associated LWP (created by calling
 *		lwp_kernel_create()). The thread will have a target DC of dc.
 *		Flags should be either 0 or SYSDC_THREAD_BATCH. If
 *		SYSDC_THREAD_BATCH is specified, the thread is expected to be
 *		doing large amounts of processing.
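 *
 *		As a rough usage sketch (illustrative only; my_worker, arg,
 *		and p are placeholder names, and p is assumed to be a system
 *		process created by newproc()), a caller might do:
 *
 *			klwp_t *lwp;
 *
 *			lwp = lwp_kernel_create(p, my_worker, arg, TS_RUN,
 *			    minclsyspri);
 *			sysdc_thread_enter(lwptot(lwp), 80, 0);
 *
 *		to run my_worker at a target duty cycle of 80. dc is
 *		clamped to [sysdc_minDC, sysdc_maxDC] by
 *		sysdc_thread_enter().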
 *
 *
 * Complications
 *
 * - Run queue balancing
 *
 *	The Solaris dispatcher is biased towards letting a thread run
 *	on the same CPU which it last ran on, if no more than 3 ticks
 *	(i.e. rechoose_interval) have passed since the thread last ran.
 *	This helps to preserve cache warmth. On the other hand, it also
 *	tries to keep the per-CPU run queues fairly balanced; if the CPU
 *	chosen for a runnable thread has a run queue which is three or
 *	more threads longer than a neighboring CPU's queue, the runnable
 *	thread is dispatched onto the neighboring CPU instead.
 *
 *	These policies work well for some workloads, but not for many SDC
 *	threads. The taskq client of SDC, for example, has many discrete
 *	units of work to do. The work units are largely independent, so
 *	cache warmth is not an important consideration. It is important
 *	that the threads fan out quickly to different CPUs, since the
 *	amount of work these threads have to do (a few seconds worth at a
 *	time) doesn't leave much time to correct thread placement errors
 *	(i.e. two SDC threads being dispatched to the same CPU).
 *
 *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 *	This tells the dispatcher to keep neighboring run queues' lengths
 *	more evenly matched, which allows SDC threads to migrate more
 *	easily.
 *
 * - LWPs and system processes
 *
 *	SDC can only be used for kernel threads. Since SDC uses microstate
 *	accounting data to compute each thread's actual duty cycle, all
 *	threads entering the SDC class must have associated LWPs (which
 *	store the microstate data). This means that the threads have to
 *	be associated with an SSYS process, i.e. one created by newproc().
 *	If the microstate accounting information is ever moved into the
 *	kthread_t, this restriction could be lifted.
 *
 * - Dealing with oversubscription
 *
 *	Since SDC duty cycles are per-thread, it is possible that the
 *	aggregate requested duty cycle of all SDC threads in a processor
 *	set could be greater than the total CPU time available in that set.
 *	The FSS scheduling class has an analogous situation, which it deals
 *	with by reducing each thread's allotted CPU time proportionally.
 *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
 *	solution to the oversubscription problem.
 *
 *	sysdc_update() accumulates the amount of time that max-priority SDC
 *	threads have spent on-CPU in each processor set, and uses that sum
 *	to create an implied duty cycle for that processor set:
 *
 *			      accumulated CPU time
 *	   pset DC = -----------------------------------
 *		      (# CPUs) * time since last update
 *
 *	If this implied duty cycle is above a maximum pset duty cycle (90%
 *	by default), sysdc_update() sets the priority of all SDC threads
 *	in that processor set to sysdc_minpri for a "break" period. After
 *	the break period, it waits for a "nobreak" period before trying to
 *	enforce the pset duty cycle limit again.
 *
 * - Processor sets
 *
 *	As the above implies, SDC is processor set aware, but it does not
 *	currently allow threads to change processor sets while in the SDC
 *	class. Instead, those threads must join the desired processor set
 *	before entering SDC. [1]
 *
 * - Batch threads
 *
 *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *	flag. This flag currently has no effect, but marks threads which
 *	do bulk processing.
 *
 * - Why not FSS?
 *
 *	It might seem that the existing FSS scheduling class could solve
 *	the problems that SDC is attempting to solve. FSS's more precise
 *	solution to the oversubscription problem would hardly cause
 *	trouble, as long as it performed well. SDC is implemented as
 *	a separate scheduling class for two main reasons: the initial
 *	consumer of SDC does not map well onto the "project" abstraction
 *	that is central to FSS, and FSS does not expect to run at kernel
 *	priorities.
 *
 *
 * Tunables
 *
 *	- sysdc_update_interval_msec:  Number of milliseconds between
 *	  consecutive thread priority updates.
 *
 *	- sysdc_reset_interval_msec:  Number of milliseconds between
 *	  consecutive resets of a thread's base ONPROC and Runnable
 *	  times.
 *
 *	- sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 *	  before a thread is pruned from the active list.
 *
 *	- sysdc_max_pset_DC:  Allowable percentage of a processor set's
 *	  CPU time which SDC can give to its high-priority threads.
 *
 *	- sysdc_break_msec:  Number of milliseconds of "break" taken when
 *	  sysdc_max_pset_DC is exceeded.
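 *
 *	As a rough illustration of how sysdc_update_interval_msec,
 *	sysdc_max_pset_DC, and sysdc_break_msec interact (using the
 *	default values above): an 80ms break at a 20ms update interval
 *	is four update periods, so sysdc_initparam() derives a "nobreak"
 *	period of 4 * 0.90 / (1 - 0.90) = 36 update periods (720ms). A
 *	persistently oversubscribed pset therefore alternates roughly
 *	80ms of "break" with 720ms of normal operation, which keeps SDC's
 *	share of that pset near sysdc_max_pset_DC.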
 *
 *
 * Future work (in SDC and related subsystems)
 *
 * - Per-thread rechoose interval (0 for SDC)
 *
 *	Allow each thread to specify its own rechoose interval. SDC
 *	threads would specify an interval of zero, which would rechoose
 *	the CPU with the lowest priority once per update.
 *
 * - Allow threads to change processor sets after joining the SDC class
 *
 * - Thread groups and per-group DC
 *
 *	It might be nice to be able to specify a duty cycle which applies
 *	to a group of threads in aggregate.
 *
 * - Per-group DC callback to allow dynamic DC tuning
 *
 *	Currently, DCs are assigned when the thread joins SDC. Some
 *	workloads could benefit from being able to tune their DC using
 *	subsystem-specific knowledge about the workload.
 *
 * - Finer-grained priority updates
 *
 * - More nuanced management of oversubscription
 *
 * - Moving other CPU-intensive threads into SDC
 *
 * - Move msacct data into kthread_t
 *
 *	This would allow kernel threads without LWPs to join SDC.
 *
 *
 * Footnotes
 *
 * [1] The details of doing so are left as an exercise for the reader.
 */

#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>

#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/schedctl.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/var.h>

/*
 * Tunables - loaded into the internal state at module load time
 */
uint_t		sysdc_update_interval_msec = 20;
uint_t		sysdc_reset_interval_msec = 400;
uint_t		sysdc_prune_interval_msec = 100;
uint_t		sysdc_max_pset_DC = 90;
uint_t		sysdc_break_msec = 80;

/*
 * Internal state - constants set up by sysdc_initparam()
 */
static clock_t	sysdc_update_ticks;	/* ticks between updates */
static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
static uint_t	sysdc_reset_updates;	/* # of updates before reset */
static uint_t	sysdc_break_updates;	/* updates to break */
static uint_t	sysdc_nobreak_updates;	/* updates to not check */
static uint_t	sysdc_minDC;		/* minimum allowed DC */
static uint_t	sysdc_maxDC;		/* maximum allowed DC */
static pri_t	sysdc_minpri;		/* minimum allowed priority */
static pri_t	sysdc_maxpri;		/* maximum allowed priority */

/*
 * Internal state
 */
static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
static list_t	sysdc_psets;		/* list of psets with SDC threads */
static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
static uint_t	sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
static sysdc_t	sysdc_dummy;		/* used to terminate active lists */

/*
 * Internal state - active hash table
 */
#define	SYSDC_NLISTS	8
#define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
#define	SYSDC_LIST(sdc)	(&sysdc_active[SYSDC_HASH(sdc)])

#ifdef DEBUG
static struct {
	uint64_t	sysdc_update_times_asleep;
	uint64_t	sysdc_update_times_base_ran_backwards;
	uint64_t	sysdc_update_times_already_done;
	uint64_t	sysdc_update_times_cur_ran_backwards;
	uint64_t	sysdc_compute_pri_breaking;
	uint64_t	sysdc_activate_enter;
	uint64_t	sysdc_update_enter;
	uint64_t	sysdc_update_exited;
	uint64_t	sysdc_update_not_sdc;
	uint64_t	sysdc_update_idle;
	uint64_t	sysdc_update_take_break;
	uint64_t	sysdc_update_no_psets;
	uint64_t	sysdc_tick_not_sdc;
	uint64_t	sysdc_tick_quantum_expired;
	uint64_t	sysdc_thread_enter_enter;
} sysdc_stats;

#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
#else
#define	SYSDC_INC_STAT(x)	((void)0)
#endif

/* macros are UPPER CASE */
#define	HOWMANY(a, b)	howmany((a), (b))
#define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)

static void
sysdc_initparam(void)
{
	uint_t sysdc_break_ticks;

	/* update / prune intervals */
	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);

	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
	    sysdc_update_interval_msec);
	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
	    sysdc_update_interval_msec);

	/* We must get at least a little time on CPU. */
	sysdc_minDC = 1;
	sysdc_maxDC = SYSDC_DC_MAX;
	sysdc_minpri = 0;
	sysdc_maxpri = maxclsyspri - 1;

	/* break parameters */
	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
		sysdc_max_pset_DC = SYSDC_DC_MAX;
	}
	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);

	/*
	 * We want:
	 *
	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
	 *
	 *	==>	nobreak = sysdc_max_pset_DC * (break + nobreak)
	 *
	 *			    sysdc_max_pset_DC * break
	 *	==>	nobreak = -------------------------
	 *			      1 - sysdc_max_pset_DC
	 */
	sysdc_nobreak_updates =
	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
	    (SYSDC_DC_MAX - sysdc_max_pset_DC));

	sysdc_param_init = 1;
}

#undef HOWMANY
#undef MSECTOTICKS

#define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
#define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
#define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */

/*
 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 * and Runnable times since the last reset.
 *
 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 * to track processor set usage.
 */
static void
sysdc_update_times(sysdc_t *sdc, uint_t flags,
    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
{
	kthread_t *const t = sdc->sdc_thread;
	const uint_t initial = (flags & SDC_UPDATE_INITIAL);
	const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
	const clock_t now = ddi_get_lbolt();
	uint_t do_reset;

	ASSERT(THREAD_LOCK_HELD(t));

	*O = *R = 0;

	/* If we've been sleeping, we know we haven't had any ONPROC time. */
	if (sdc->sdc_sleep_updates != 0 &&
	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
		*newO = sdc->sdc_last_base_O;
		SYSDC_INC_STAT(sysdc_update_times_asleep);
		return;
	}

	/*
	 * If this is our first update, or we've hit the reset point,
	 * we need to reset our base_{O,R}. Once we've updated them, we
	 * report O and R for the entire prior interval.
	 */
	do_reset = initial;
	if (update) {
		++sdc->sdc_nupdates;
		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
			do_reset = 1;
	}
	if (do_reset) {
		hrtime_t baseO, baseR;
		if (initial) {
			/*
			 * Start off our cycle count somewhere in the middle,
			 * to keep the resets from all happening at once.
			 *
			 * 4999 is a handy prime much larger than
			 * sysdc_reset_updates, so that we don't run into
			 * trouble if the resolution is a multiple of
			 * sysdc_reset_updates.
			 */
			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
			    sysdc_reset_updates);
			baseO = baseR = 0;
		} else {
			baseO = sdc->sdc_base_O;
			baseR = sdc->sdc_base_R;
		}

		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
		*newO = sdc->sdc_base_O;

		sdc->sdc_reset = now;
		sdc->sdc_pri_check = -1; /* force mismatch below */

		/*
		 * See below for rationale.
		 */
		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
			baseO = sdc->sdc_base_O;
			baseR = sdc->sdc_base_R;
		}

		/* compute based on the entire interval */
		*O = (sdc->sdc_base_O - baseO);
		*R = (sdc->sdc_base_R - baseR);
		return;
	}

	/*
	 * If we're called from sysdc_update(), we *must* return a value
	 * for newO, so we always call mstate_systhread_times().
	 *
	 * Otherwise, if we've already done a pri check this tick,
	 * we can skip it.
	 */
	if (!update && sdc->sdc_pri_check == now) {
		SYSDC_INC_STAT(sysdc_update_times_already_done);
		return;
	}

	/* Get the current times from the thread */
	sdc->sdc_pri_check = now;
	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
	*newO = sdc->sdc_cur_O;

	/*
	 * The updating of microstate accounting is not done under a
	 * consistent set of locks, particularly the t_waitrq field. This
	 * can lead to narrow windows in which we account for time in the
	 * wrong bucket, which on the next read will be accounted for
	 * correctly.
	 *
	 * If our sdc_base_* fields were affected by one of these blips, we
	 * throw away the old data, and pretend this tick didn't happen.
	 */
	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
	    sdc->sdc_cur_R < sdc->sdc_base_R) {

		sdc->sdc_base_O = sdc->sdc_cur_O;
		sdc->sdc_base_R = sdc->sdc_cur_R;

		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
		return;
	}

	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
}

/*
 * sysdc_compute_pri()
 *
 *	Recomputes the priority of the thread, leaving the result in
 *	sdc->sdc_epri. Returns 1 if a priority update should occur
 *	(which will also trigger a cpu_surrender()), otherwise
 *	returns 0.
 */
static uint_t
sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
{
	kthread_t *const t = sdc->sdc_thread;
	const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
	const uint_t tick = (flags & SDC_UPDATE_TICK);

	hrtime_t O, R;
	hrtime_t newO = -1;

	ASSERT(THREAD_LOCK_HELD(t));

	sysdc_update_times(sdc, flags, &O, &R, &newO);
	ASSERT(!update || newO != -1);

	/* If we have new data, recompute our priority. */
	if ((O + R) != 0) {
		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);

		/* Adjust our priority to move our DC closer to the target. */
		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
			sdc->sdc_pri = sdc->sdc_maxpri;
		else
			sdc->sdc_pri = sdc->sdc_minpri;
	}

	/*
	 * If our per-pset duty cycle goes over the max, we will take a break.
	 * This forces all sysdc threads in the pset to minimum priority, in
	 * order to let everyone else have a chance at the CPU.
	 */
	if (sdc->sdc_pset->sdp_need_break) {
		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
		sdc->sdc_epri = sdc->sdc_minpri;
	} else {
		sdc->sdc_epri = sdc->sdc_pri;
	}

	DTRACE_PROBE4(sysdc__compute__pri,
	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
	    uint_t, sdc->sdc_target_DC);

	/*
	 * For sysdc_update(), we compute the ONPROC time for high-priority
	 * threads, which is used to calculate the per-pset duty cycle. We
	 * will always tell our callers to update the thread's priority,
	 * since we want to force a cpu_surrender().
	 *
	 * We reset sdc_update_ticks so that sysdc_tick() will only update
	 * the thread's priority if our timeout is delayed by a tick or
	 * more.
	 */
	if (update) {
		/* SDC threads are not allowed to change cpupart bindings. */
		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);

		/* If we were at MAXPRI, account for our onproc time. */
		if (t->t_pri == sdc->sdc_maxpri &&
		    sdc->sdc_last_base_O != 0 &&
		    sdc->sdc_last_base_O < newO) {
			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
			sdc->sdc_pset->sdp_onproc_time +=
			    (uint64_t)sdc->sdc_last_O;
			sdc->sdc_pset->sdp_onproc_threads++;
		} else {
			sdc->sdc_last_O = 0;
		}
		sdc->sdc_last_base_O = newO;

		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
		return (1);
	}

	/*
	 * Like sysdc_update(), sysdc_tick() always wants to update the
	 * thread's priority, so that the CPU is surrendered if necessary.
	 * We reset sdc_update_ticks so that if the timeout continues to be
	 * delayed, we'll update at the regular interval.
	 */
	if (tick) {
		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
		return (1);
	}

	/*
	 * Otherwise, only tell our callers to update the priority if it has
	 * changed.
	 */
	return (sdc->sdc_epri != t->t_pri);
}

static void
sysdc_update_pri(sysdc_t *sdc, uint_t flags)
{
	kthread_t *t = sdc->sdc_thread;

	ASSERT(THREAD_LOCK_HELD(t));

	if (sysdc_compute_pri(sdc, flags)) {
		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
			cpu_surrender(t);
		}
	}
}

/*
 * Add a thread onto the active list. It will only be removed by
 * sysdc_update().
 */
static void
sysdc_activate(sysdc_t *sdc)
{
	sysdc_t	*volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
	sysdc_t	*head;
	kthread_t *t = sdc->sdc_thread;

	SYSDC_INC_STAT(sysdc_activate_enter);

	ASSERT(sdc->sdc_next == NULL);
	ASSERT(THREAD_LOCK_HELD(t));

	do {
		head = *headp;
		sdc->sdc_next = head;
	} while (atomic_cas_ptr(headp, head, sdc) != head);
}

/*
 * sysdc_update() has two jobs:
 *
 *	1. It updates the priorities of all active SDC threads on the system.
 *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
 */
static void
sysdc_update(void *arg)
{
	int		idx;
	sysdc_t		*freelist = NULL;
	sysdc_pset_t	*cur;
	hrtime_t	now, diff;
	uint_t		redeploy = 1;

	SYSDC_INC_STAT(sysdc_update_enter);

	ASSERT(sysdc_update_timeout_started);

	/*
	 * If this is our first time through, diff will be gigantic, and
	 * no breaks will be necessary.
	 */
	now = gethrtime();
	diff = now - sysdc_last_update;
	sysdc_last_update = now;

	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {
		boolean_t breaking = (cur->sdp_should_break != 0);

		if (cur->sdp_need_break != breaking) {
			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
			    boolean_t, breaking);
		}
		cur->sdp_onproc_time = 0;
		cur->sdp_onproc_threads = 0;
		cur->sdp_need_break = breaking;
	}
	mutex_exit(&sysdc_pset_lock);

	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
		sysdc_list_t		*sdl = &sysdc_active[idx];
		sysdc_t			*volatile *headp = &sdl->sdl_list;
		sysdc_t			*head, *tail;
		sysdc_t			**prevptr;

		if (*headp == &sysdc_dummy)
			continue;

		/* Prevent any threads from exiting while we're poking them. */
		mutex_enter(&sdl->sdl_lock);

		/*
		 * Each sdl_list contains a singly-linked list of active
		 * threads. Threads which become active while we are
		 * processing the list will be added to sdl_list. Since we
		 * don't want that to interfere with our own processing, we
		 * swap in an empty list. Any newly active threads will
		 * go on to this empty list. When finished, we'll put any
		 * such threads at the end of the processed list.
		 */
		head = atomic_swap_ptr(headp, &sysdc_dummy);
		prevptr = &head;
		while (*prevptr != &sysdc_dummy) {
			sysdc_t		*const	sdc = *prevptr;
			kthread_t	*const	t = sdc->sdc_thread;

			/*
			 * If the thread has exited, move its sysdc_t onto
			 * freelist, to be freed later.
			 */
			if (t == NULL) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_exited);
				sdc->sdc_next = freelist;
				freelist = sdc;
				continue;
			}

			thread_lock(t);
			if (t->t_cid != sysdccid) {
				thread_unlock(t);
				prevptr = &sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_not_sdc);
				continue;
			}
			ASSERT(t->t_cldata == sdc);

			/*
			 * If the thread has been sleeping for longer
			 * than sysdc_prune_interval, make it inactive by
			 * removing it from the list.
			 */
			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
			    sdc->sdc_sleep_updates != 0 &&
			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
			    sysdc_prune_updates) {
				*prevptr = sdc->sdc_next;
				SYSDC_INC_STAT(sysdc_update_idle);
				sdc->sdc_next = NULL;
				thread_unlock(t);
				continue;
			}
			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
			thread_unlock(t);

			prevptr = &sdc->sdc_next;
		}

		/*
		 * Add our list to the bucket, putting any new entries
		 * added while we were working at the tail of the list.
		 */
		do {
			tail = *headp;
			*prevptr = tail;
		} while (atomic_cas_ptr(headp, tail, head) != tail);

		mutex_exit(&sdl->sdl_lock);
	}

	mutex_enter(&sysdc_pset_lock);
	for (cur = list_head(&sysdc_psets); cur != NULL;
	    cur = list_next(&sysdc_psets, cur)) {

		cur->sdp_vtime_last_interval =
		    diff * cur->sdp_cpupart->cp_ncpus;
		cur->sdp_DC_last_interval =
		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
		    cur->sdp_vtime_last_interval;

		if (cur->sdp_should_break > 0) {
			cur->sdp_should_break--;	/* breaking */
			continue;
		}
		if (cur->sdp_dont_break > 0) {
			cur->sdp_dont_break--;	/* waiting before checking */
			continue;
		}
		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
			cur->sdp_should_break = sysdc_break_updates;
			cur->sdp_dont_break = sysdc_nobreak_updates;
			SYSDC_INC_STAT(sysdc_update_take_break);
		}
	}

	/*
	 * If there are no sysdc_psets, there can be no threads, so
	 * we can stop doing our timeout. Since we're holding the
	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
	 * prevent anyone from racing with this and dropping our timeout
	 * on the floor.
	 */
	if (list_is_empty(&sysdc_psets)) {
		SYSDC_INC_STAT(sysdc_update_no_psets);
		ASSERT(sysdc_update_timeout_started);
		sysdc_update_timeout_started = 0;

		redeploy = 0;
	}
	mutex_exit(&sysdc_pset_lock);

	while (freelist != NULL) {
		sysdc_t *cur = freelist;
		freelist = cur->sdc_next;
		kmem_free(cur, sizeof (*cur));
	}

	if (redeploy) {
		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
	}
}

static void
sysdc_preempt(kthread_t *t)
{
	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(t));

	setbackdq(t);		/* give others a chance to run */
}

static void
sysdc_tick(kthread_t *t)
{
	sysdc_t *sdc;

	thread_lock(t);
	if (t->t_cid != sysdccid) {
		SYSDC_INC_STAT(sysdc_tick_not_sdc);
		thread_unlock(t);
		return;
	}
	sdc = t->t_cldata;
	if (t->t_state == TS_ONPROC &&
	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
		cpu_surrender(t);
	}

	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
		ASSERT(sdc->sdc_sleep_updates == 0);
	}

	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
	sdc->sdc_ticks++;
	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
	}
	thread_unlock(t);
}

static void
sysdc_setrun(kthread_t *t)
{
	sysdc_t *sdc = t->t_cldata;

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	sdc->sdc_sleep_updates = 0;

	if (sdc->sdc_next == NULL) {
		/*
		 * Since we're in transition, we don't want to use the
		 * full thread_update_pri().
		 */
		if (sysdc_compute_pri(sdc, 0)) {
			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
		}
		sysdc_activate(sdc);

		ASSERT(sdc->sdc_next != NULL);
	}

	setbackdq(t);
}

static void
sysdc_wakeup(kthread_t *t)
{
	sysdc_setrun(t);
}

static void
sysdc_sleep(kthread_t *t)
{
	sysdc_t *sdc = t->t_cldata;

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
}

/*ARGSUSED*/
static int
sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
    void *bufp)
{
	cpupart_t *const cpupart = t->t_cpupart;
	sysdc_t *sdc = bufp;
	sysdc_params_t *sdpp = parmsp;
	sysdc_pset_t *newpset = sdc->sdc_pset;
	sysdc_pset_t *pset;
	int start_timeout;

	if (t->t_cid != syscid)
		return (EPERM);

	ASSERT(ttolwp(t) != NULL);
	ASSERT(sdpp != NULL);
	ASSERT(newpset != NULL);
	ASSERT(sysdc_param_init);

	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);

	sdc->sdc_thread = t;
	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
	sdc->sdc_minpri = sdpp->sdp_minpri;
	sdc->sdc_maxpri = sdpp->sdp_maxpri;
	sdc->sdc_target_DC = sdpp->sdp_DC;
	sdc->sdc_ticks = 0;
	sdc->sdc_update_ticks = sysdc_update_ticks + 1;

	/* Assign ourselves to the appropriate pset. */
	sdc->sdc_pset = NULL;
	mutex_enter(&sysdc_pset_lock);
	for (pset = list_head(&sysdc_psets); pset != NULL;
	    pset = list_next(&sysdc_psets, pset)) {
		if (pset->sdp_cpupart == cpupart) {
			break;
		}
	}
	if (pset == NULL) {
		pset = newpset;
		newpset = NULL;
		pset->sdp_cpupart = cpupart;
		list_insert_tail(&sysdc_psets, pset);
	}
	pset->sdp_nthreads++;
	ASSERT(pset->sdp_nthreads > 0);

	sdc->sdc_pset = pset;

	start_timeout = (sysdc_update_timeout_started == 0);
	sysdc_update_timeout_started = 1;
	mutex_exit(&sysdc_pset_lock);

	if (newpset != NULL)
		kmem_free(newpset, sizeof (*newpset));

	/* Update t's scheduling class and priority. */
	thread_lock(t);
	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
	t->t_cid = cid;
	t->t_cldata = sdc;
	t->t_schedflag |= TS_RUNQMATCH;

	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
	thread_unlock(t);

	/* Kick off the thread timeout if we're the first one in. */
	if (start_timeout) {
		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
	}

	return (0);
}

static void
sysdc_leave(sysdc_t *sdc)
{
	sysdc_pset_t *sdp = sdc->sdc_pset;
	sysdc_list_t *sdl = SYSDC_LIST(sdc);
	uint_t freedc;

	mutex_enter(&sdl->sdl_lock);	/* block sysdc_update() */
	sdc->sdc_thread = NULL;
	freedc = (sdc->sdc_next == NULL);
	mutex_exit(&sdl->sdl_lock);

	mutex_enter(&sysdc_pset_lock);
	ASSERT(sdp != NULL);
	ASSERT(sdp->sdp_nthreads > 0);
	--sdp->sdp_nthreads;
	if (sdp->sdp_nthreads == 0) {
		list_remove(&sysdc_psets, sdp);
	} else {
		sdp = NULL;
	}
	mutex_exit(&sysdc_pset_lock);

	if (freedc)
		kmem_free(sdc, sizeof (*sdc));
	if (sdp != NULL)
		kmem_free(sdp, sizeof (*sdp));
}

static void
sysdc_exitclass(void *buf)
{
	sysdc_leave((sysdc_t *)buf);
}

/*ARGSUSED*/
static int
sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
	/* Threads cannot exit SDC once joined, except in a body bag. */
	return (EPERM);
}

static void
sysdc_exit(kthread_t *t)
{
	sysdc_t *sdc;

	/* We're exiting, so we just rejoin the SYS class. */
	thread_lock(t);
	ASSERT(t->t_cid == sysdccid);
	sdc = t->t_cldata;
	t->t_cid = syscid;
	t->t_cldata = NULL;
	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
	(void) thread_change_pri(t, maxclsyspri, 0);
	t->t_schedflag &= ~TS_RUNQMATCH;
	thread_unlock_nopreempt(t);

	/* Unlink the sdc from everything. */
	sysdc_leave(sdc);
}

/*ARGSUSED*/
static int
sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
	/*
	 * Threads cannot be created with SDC as their class; they must
	 * be created as SYS and then added with sysdc_thread_enter().
	 * Because of this restriction, sysdc_fork() should never be called.
	 */
	panic("sysdc cannot be forked");

	return (ENOSYS);
}

/*ARGSUSED*/
static void
sysdc_forkret(kthread_t *t, kthread_t *ct)
{
	/* SDC threads are part of system processes, which never fork. */
	panic("sysdc cannot be forked");
}

static pri_t
sysdc_globpri(kthread_t *t)
{
	return (t->t_epri);
}

/*ARGSUSED*/
static pri_t
sysdc_no_swap(kthread_t *t, int flags)
{
	/* SDC threads cannot be swapped. */
	return (-1);
}

/*
 * Get maximum and minimum priorities enjoyed by SDC threads.
 */
static int
sysdc_getclpri(pcpri_t *pcprip)
{
	pcprip->pc_clpmax = sysdc_maxpri;
	pcprip->pc_clpmin = sysdc_minpri;
	return (0);
}

/*ARGSUSED*/
static int
sysdc_getclinfo(void *arg)
{
	return (0);		/* no class-specific info */
}

/*ARGSUSED*/
static int
sysdc_alloc(void **p, int flag)
{
	sysdc_t *new;

	*p = NULL;
	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
		return (ENOMEM);
	}
	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
	    NULL) {
		kmem_free(new, sizeof (*new));
		return (ENOMEM);
	}
	*p = new;
	return (0);
}

static void
sysdc_free(void *p)
{
	sysdc_t *sdc = p;

	if (sdc != NULL) {
		/*
		 * We must have failed CL_ENTERCLASS(), so our pset should be
		 * there and unused.
		 */
		ASSERT(sdc->sdc_pset != NULL);
		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
		kmem_free(sdc, sizeof (*sdc));
	}
}

static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();

static struct classfuncs sysdc_classfuncs = {
	/* messages to class manager */
	{
		sysdc_enosys,	/* admin */
		sysdc_getclinfo,
		sysdc_enosys,	/* parmsin */
		sysdc_enosys,	/* parmsout */
		sysdc_enosys,	/* vaparmsin */
		sysdc_enosys,	/* vaparmsout */
		sysdc_getclpri,
		sysdc_alloc,
		sysdc_free,
	},
	/* operations on threads */
	{
		sysdc_enterclass,
		sysdc_exitclass,
		sysdc_canexit,
		sysdc_fork,
		sysdc_forkret,
		sysdc_nullsys,	/* parmsget */
		sysdc_enosys,	/* parmsset */
		sysdc_nullsys,	/* stop */
		sysdc_exit,
		sysdc_nullsys,	/* active */
		sysdc_nullsys,	/* inactive */
		sysdc_no_swap,	/* swapin */
		sysdc_no_swap,	/* swapout */
		sysdc_nullsys,	/* trapret */
		sysdc_preempt,
		sysdc_setrun,
		sysdc_sleep,
		sysdc_tick,
		sysdc_wakeup,
		sysdc_einval,	/* donice */
		sysdc_globpri,
		sysdc_nullsys,	/* set_process_group */
		sysdc_nullsys,	/* yield */
		sysdc_einval,	/* doprio */
	}
};

static int
sysdc_enosys()
{
	return (ENOSYS);
}

static int
sysdc_einval()
{
	return (EINVAL);
}

static void
sysdc_nullsys()
{
}

/*ARGSUSED*/
static pri_t
sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
{
	int idx;

	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
	    offsetof(sysdc_pset_t, sdp_node));

	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
		sysdc_active[idx].sdl_list = &sysdc_dummy;
	}

	sysdc_initparam();

	sysdccid = cid;
	*clfuncspp = &sysdc_classfuncs;

	return ((pri_t)v.v_maxsyspri);
}

static struct sclass csw = {
	"SDC",
	sysdc_init,
	0
};

static struct modlsched modlsched = {
	&mod_schedops, "system duty cycle scheduling class", &csw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_fini()
{
	return (EBUSY);		/* can't unload for now */
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* --- consolidation-private interfaces --- */
void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
	void		*buf = NULL;
	sysdc_params_t	sdp;

	SYSDC_INC_STAT(sysdc_thread_enter_enter);

	ASSERT(sysdc_param_init);
	ASSERT(sysdccid >= 0);

	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

	sdp.sdp_minpri = sysdc_minpri;
	sdp.sdp_maxpri = sysdc_maxpri;
	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

	VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

	ASSERT(t->t_lwp != NULL);
	ASSERT(t->t_cid == syscid);
	ASSERT(t->t_cldata == NULL);
	VERIFY0(CL_CANEXIT(t, NULL));
	VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
	CL_EXITCLASS(syscid, NULL);
}