11909 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/disp/sysdc.c
+++ new/usr/src/uts/common/disp/sysdc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * The System Duty Cycle (SDC) scheduling class
28 28 * --------------------------------------------
29 29 *
30 30 * Background
31 31 *
32 32 * Kernel threads in Solaris have traditionally not been large consumers
33 33 * of CPU time. They typically wake up, perform a small amount of
34 34 * work, then go back to sleep waiting for either a timeout or another
35 35 * signal. On the assumption that the small amount of work that they do
36 36 * is important for the behavior of the whole system, these threads are
37 37 * treated kindly by the dispatcher and the SYS scheduling class: they run
38 38 * without preemption from anything other than real-time and interrupt
39 39 * threads; when preempted, they are put at the front of the queue, so they
40 40 * generally do not migrate between CPUs; and they are allowed to stay
41 41 * running until they voluntarily give up the CPU.
42 42 *
43 43 * As Solaris has evolved, new workloads have emerged which require the
44 44 * kernel to perform significant amounts of CPU-intensive work. One
45 45 * example of such a workload is ZFS's transaction group sync processing.
46 46 * Each sync operation generates a large batch of I/Os, and each I/O
47 47 * may need to be compressed and/or checksummed before it is written to
48 48 * storage. The taskq threads which perform the compression and checksums
49 49 * will run nonstop as long as they have work to do; a large sync operation
50 50 * on a compression-heavy dataset can keep them busy for seconds on end.
51 51 * This causes human-time-scale dispatch latency bubbles for any other
52 52 * threads which have the misfortune to share a CPU with the taskq threads.
53 53 *
54 54 * The SDC scheduling class is a solution to this problem.
55 55 *
56 56 *
57 57 * Overview
58 58 *
59 59 * SDC is centered around the concept of a thread's duty cycle (DC):
60 60 *
61 61 * ONPROC time
62 62 * Duty Cycle = ----------------------
63 63 * ONPROC + Runnable time
64 64 *
65 65 * This is the ratio of the time that the thread spent running on a CPU
66 66 * divided by the time it spent running or trying to run. It is unaffected
67 67 * by any time the thread spent sleeping, stopped, etc.
68 68 *
69 69 * A thread joining the SDC class specifies a "target" DC that it wants
70 70 * to run at. To implement this policy, the routine sysdc_update() scans
71 71 * the list of active SDC threads every few ticks and uses each thread's
72 72 * microstate data to compute the actual duty cycle that that thread
73 73 * has experienced recently. If the thread is under its target DC, its
74 74 * priority is increased to the maximum available (sysdc_maxpri, which is
75 75 * 99 by default). If the thread is over its target DC, its priority is
76 76 * reduced to the minimum available (sysdc_minpri, 0 by default). This
77 77 * is a fairly primitive approach, in that it doesn't use any of the
78 78 * intermediate priorities, but it's not completely inappropriate. Even
79 79 * though threads in the SDC class might take a while to do their job, they
80 80 * are by some definition important if they're running inside the kernel,
81 81 * so it is reasonable that they should get to run at priority 99.
82 82 *
83 83 * If a thread is running when sysdc_update() calculates its actual duty
84 84 * cycle, and there are other threads of equal or greater priority on its
85 85 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
86 86 * acknowledges the preemption by calling sysdc_preempt(), which calls
87 87 * setbackdq(), which gives other threads with the same priority a chance
88 88 * to run. This creates a de facto time quantum for threads in the SDC
89 89 * scheduling class.
90 90 *
91 91 * An SDC thread which is assigned priority 0 can continue to run if
92 92 * nothing else needs to use the CPU that it's running on. Similarly, an
93 93 * SDC thread at priority 99 might not get to run as much as it wants to
94 94 * if there are other priority-99 or higher threads on its CPU. These
95 95 * situations would cause the thread to get ahead of or behind its target
96 96 * DC; the longer the situations lasted, the further ahead or behind the
97 97 * thread would get. Rather than condemning a thread to a lifetime of
98 98 * paying for its youthful indiscretions, SDC keeps "base" values for
99 99 * ONPROC and Runnable times in each thread's sysdc data, and updates these
100 100 * values periodically. The duty cycle is then computed using the elapsed
101 101 * amount of ONPROC and Runnable times since those base times.
102 102 *
103 103 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
104 104 * keep the list of "active" threads small by pruning out threads which
105 105 * have been asleep for a brief time. They are not pruned immediately upon
106 106 * going to sleep, since some threads may bounce back and forth between
107 107 * sleeping and being runnable.
108 108 *
109 109 *
110 110 * Interfaces
111 111 *
112 112 * void sysdc_thread_enter(t, dc, flags)
113 113 *
114 114 * Moves a kernel thread from the SYS scheduling class to the
115 115 * SDC class. t must have an associated LWP (created by calling
116 116 * lwp_kernel_create()). The thread will have a target DC of dc.
117 117 * Flags should be either 0 or SYSDC_THREAD_BATCH. If
118 118 * SYSDC_THREAD_BATCH is specified, the thread is expected to be
119 119 * doing large amounts of processing.
120 120 *
121 121 *
122 122 * Complications
123 123 *
124 124 * - Run queue balancing
125 125 *
126 126 * The Solaris dispatcher is biased towards letting a thread run
127 127 * on the same CPU which it last ran on, if no more than 3 ticks
128 128 * (i.e. rechoose_interval) have passed since the thread last ran.
129 129 * This helps to preserve cache warmth. On the other hand, it also
130 130 * tries to keep the per-CPU run queues fairly balanced; if the CPU
131 131 * chosen for a runnable thread has a run queue which is three or
132 132 * more threads longer than a neighboring CPU's queue, the runnable
133 133 * thread is dispatched onto the neighboring CPU instead.
134 134 *
135 135 * These policies work well for some workloads, but not for many SDC
136 136 * threads. The taskq client of SDC, for example, has many discrete
137 137 * units of work to do. The work units are largely independent, so
138 138 * cache warmth is not an important consideration. It is important
139 139 * that the threads fan out quickly to different CPUs, since the
140 140 * amount of work these threads have to do (a few seconds worth at a
141 141 * time) doesn't leave much time to correct thread placement errors
142 142 * (i.e. two SDC threads being dispatched to the same CPU).
143 143 *
144 144 * To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
145 145 * This tells the dispatcher to keep neighboring run queues' lengths
146 146 * more evenly matched, which allows SDC threads to migrate more
147 147 * easily.
148 148 *
149 149 * - LWPs and system processes
150 150 *
151 151 * SDC can only be used for kernel threads. Since SDC uses microstate
152 152 * accounting data to compute each thread's actual duty cycle, all
153 153 * threads entering the SDC class must have associated LWPs (which
154 154 * store the microstate data). This means that the threads have to
155 155 * be associated with an SSYS process, i.e. one created by newproc().
156 156 * If the microstate accounting information is ever moved into the
157 157 * kthread_t, this restriction could be lifted.
158 158 *
159 159 * - Dealing with oversubscription
160 160 *
161 161 * Since SDC duty cycles are per-thread, it is possible that the
162 162 * aggregate requested duty cycle of all SDC threads in a processor
163 163 * set could be greater than the total CPU time available in that set.
164 164 * The FSS scheduling class has an analogous situation, which it deals
165 165 * with by reducing each thread's allotted CPU time proportionally.
166 166 * Since SDC doesn't need to be as precise as FSS, it uses a simpler
167 167 * solution to the oversubscription problem.
168 168 *
169 169 * sysdc_update() accumulates the amount of time that max-priority SDC
170 170 * threads have spent on-CPU in each processor set, and uses that sum
171 171 * to create an implied duty cycle for that processor set:
172 172 *
173 173 * accumulated CPU time
174 174 * pset DC = -----------------------------------
175 175 * (# CPUs) * time since last update
176 176 *
177 177 * If this implied duty cycle is above a maximum pset duty cycle (90%
178 178 * by default), sysdc_update() sets the priority of all SDC threads
179 179 * in that processor set to sysdc_minpri for a "break" period. After
180 180 * the break period, it waits for a "nobreak" period before trying to
181 181 * enforce the pset duty cycle limit again.
182 182 *
183 183 * - Processor sets
184 184 *
185 185 * As the above implies, SDC is processor set aware, but it does not
186 186 * currently allow threads to change processor sets while in the SDC
187 187 * class. Instead, those threads must join the desired processor set
188 188 * before entering SDC. [1]
189 189 *
190 190 * - Batch threads
191 191 *
 192  192  *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
193 193 * flag. This flag currently has no effect, but marks threads which
194 194 * do bulk processing.
195 195 *
196 - * - t_kpri_req
197 - *
198 - * The TS and FSS scheduling classes pay attention to t_kpri_req,
199 - * which provides a simple form of priority inheritance for
200 - * synchronization primitives (such as rwlocks held as READER) which
201 - * cannot be traced to a unique thread. The SDC class does not honor
202 - * t_kpri_req, for a few reasons:
203 - *
204 - * 1. t_kpri_req is notoriously inaccurate. A measure of its
205 - * inaccuracy is that it needs to be cleared every time a thread
206 - * returns to user mode, because it is frequently non-zero at that
207 - * point. This can happen because "ownership" of synchronization
208 - * primitives that use t_kpri_req can be silently handed off,
209 - * leaving no opportunity to will the t_kpri_req inheritance.
210 - *
211 - * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
212 - * kernel priority. This means that even if an SDC thread
213 - * is holding a synchronization primitive and running at low
214 - * priority, its priority will eventually be raised above 60,
215 - * allowing it to drive on and release the resource.
216 - *
217 - * 3. The first consumer of SDC uses the taskq subsystem, which holds
218 - * a reader lock for the duration of the task's execution. This
219 - * would mean that SDC threads would never drop below kernel
220 - * priority in practice, which defeats one of the purposes of SDC.
221 - *
222 196 * - Why not FSS?
223 197 *
224 198 * It might seem that the existing FSS scheduling class could solve
225 199 * the problems that SDC is attempting to solve. FSS's more precise
226 200 * solution to the oversubscription problem would hardly cause
227 201 * trouble, as long as it performed well. SDC is implemented as
228 202 * a separate scheduling class for two main reasons: the initial
229 203 * consumer of SDC does not map well onto the "project" abstraction
230 204 * that is central to FSS, and FSS does not expect to run at kernel
231 205 * priorities.
232 206 *
233 207 *
234 208 * Tunables
235 209 *
236 210 * - sysdc_update_interval_msec: Number of milliseconds between
237 211 * consecutive thread priority updates.
238 212 *
239 213 * - sysdc_reset_interval_msec: Number of milliseconds between
240 214 * consecutive resets of a thread's base ONPROC and Runnable
241 215 * times.
242 216 *
243 217 * - sysdc_prune_interval_msec: Number of milliseconds of sleeping
244 218 * before a thread is pruned from the active list.
245 219 *
246 220 * - sysdc_max_pset_DC: Allowable percentage of a processor set's
247 221 * CPU time which SDC can give to its high-priority threads.
248 222 *
249 223 * - sysdc_break_msec: Number of milliseconds of "break" taken when
250 224 * sysdc_max_pset_DC is exceeded.
251 225 *
252 226 *
253 227 * Future work (in SDC and related subsystems)
254 228 *
255 229 * - Per-thread rechoose interval (0 for SDC)
256 230 *
257 231 * Allow each thread to specify its own rechoose interval. SDC
258 232 * threads would specify an interval of zero, which would rechoose
259 233 * the CPU with the lowest priority once per update.
260 234 *
261 235 * - Allow threads to change processor sets after joining the SDC class
262 236 *
263 237 * - Thread groups and per-group DC
264 238 *
265 239 * It might be nice to be able to specify a duty cycle which applies
266 240 * to a group of threads in aggregate.
267 241 *
268 242 * - Per-group DC callback to allow dynamic DC tuning
269 243 *
270 244 * Currently, DCs are assigned when the thread joins SDC. Some
271 245 * workloads could benefit from being able to tune their DC using
272 246 * subsystem-specific knowledge about the workload.
273 247 *
274 248 * - Finer-grained priority updates
275 249 *
276 250 * - More nuanced management of oversubscription
277 251 *
278 252 * - Moving other CPU-intensive threads into SDC
279 253 *
280 254 * - Move msacct data into kthread_t
281 255 *
282 256 * This would allow kernel threads without LWPs to join SDC.
283 257 *
284 258 *
285 259 * Footnotes
286 260 *
287 261 * [1] The details of doing so are left as an exercise for the reader.
288 262 */
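
As an editorial aside, the duty-cycle arithmetic described under "Overview" above, and the two-level priority decision it drives, can be made concrete with a small standalone sketch. Everything below is illustrative only: the helper and constant names are invented, and treating the duty cycle as a percentage out of 100 is an assumption that matches the 90% default of sysdc_max_pset_DC used later in this file.

/* Illustration only; not part of the kernel source. */
#include <stdio.h>
#include <stdint.h>

#define	DC_MAX		100	/* duty cycles expressed as percentages (assumed scale) */
#define	PRI_MIN		0	/* sysdc_minpri default */
#define	PRI_MAX		99	/* sysdc_maxpri default */

/* DC = ONPROC / (ONPROC + Runnable), scaled to DC_MAX. */
static uint32_t
duty_cycle(uint64_t onproc_ns, uint64_t runnable_ns)
{
	if (onproc_ns + runnable_ns == 0)
		return (0);
	return ((uint32_t)((onproc_ns * DC_MAX) / (onproc_ns + runnable_ns)));
}

int
main(void)
{
	uint64_t onproc = 300 * 1000000ULL;	/* 300ms spent running on a CPU */
	uint64_t runnable = 100 * 1000000ULL;	/* 100ms spent waiting on a run queue */
	uint32_t target = 50;			/* thread asked for a 50% duty cycle */
	uint32_t cur = duty_cycle(onproc, runnable);

	/* Bang-bang policy: under target, jump to maximum; at or over, drop to minimum. */
	int pri = (cur < target) ? PRI_MAX : PRI_MIN;

	printf("current DC %u%%, target %u%% -> priority %d\n", cur, target, pri);
	return (0);
}

Here the thread has been on CPU 75% of the time it was runnable, which is over its 50% target, so it would be dropped to the minimum priority until its measured duty cycle falls back under the target.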
289 263
290 264 #include <sys/types.h>
291 265 #include <sys/sysdc.h>
292 266 #include <sys/sysdc_impl.h>
293 267
294 268 #include <sys/class.h>
295 269 #include <sys/cmn_err.h>
296 270 #include <sys/cpuvar.h>
297 271 #include <sys/cpupart.h>
298 272 #include <sys/debug.h>
299 273 #include <sys/disp.h>
300 274 #include <sys/errno.h>
301 275 #include <sys/inline.h>
302 276 #include <sys/kmem.h>
303 277 #include <sys/modctl.h>
304 278 #include <sys/schedctl.h>
305 279 #include <sys/sdt.h>
306 280 #include <sys/sunddi.h>
307 281 #include <sys/sysmacros.h>
308 282 #include <sys/systm.h>
309 283 #include <sys/var.h>
310 284
311 285 /*
312 286 * Tunables - loaded into the internal state at module load time
313 287 */
314 288 uint_t sysdc_update_interval_msec = 20;
315 289 uint_t sysdc_reset_interval_msec = 400;
316 290 uint_t sysdc_prune_interval_msec = 100;
317 291 uint_t sysdc_max_pset_DC = 90;
318 292 uint_t sysdc_break_msec = 80;
319 293
320 294 /*
321 295 * Internal state - constants set up by sysdc_initparam()
322 296 */
323 297 static clock_t sysdc_update_ticks; /* ticks between updates */
324 298 static uint_t sysdc_prune_updates; /* updates asleep before pruning */
325 299 static uint_t sysdc_reset_updates; /* # of updates before reset */
326 300 static uint_t sysdc_break_updates; /* updates to break */
327 301 static uint_t sysdc_nobreak_updates; /* updates to not check */
328 302 static uint_t sysdc_minDC; /* minimum allowed DC */
329 303 static uint_t sysdc_maxDC; /* maximum allowed DC */
330 304 static pri_t sysdc_minpri; /* minimum allowed priority */
331 305 static pri_t sysdc_maxpri; /* maximum allowed priority */
332 306
333 307 /*
334 308 * Internal state
335 309 */
336 310 static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
337 311 static list_t sysdc_psets; /* list of psets with SDC threads */
338 312 static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
339 313 static uint_t sysdc_update_timeout_started; /* update timeout is active */
340 314 static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
341 315 static sysdc_t sysdc_dummy; /* used to terminate active lists */
342 316
343 317 /*
344 318 * Internal state - active hash table
345 319 */
346 320 #define SYSDC_NLISTS 8
347 321 #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
348 322 static sysdc_list_t sysdc_active[SYSDC_NLISTS];
349 323 #define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
350 324
351 325 #ifdef DEBUG
352 326 static struct {
353 327 uint64_t sysdc_update_times_asleep;
354 328 uint64_t sysdc_update_times_base_ran_backwards;
355 329 uint64_t sysdc_update_times_already_done;
356 330 uint64_t sysdc_update_times_cur_ran_backwards;
357 331 uint64_t sysdc_compute_pri_breaking;
358 332 uint64_t sysdc_activate_enter;
359 333 uint64_t sysdc_update_enter;
360 334 uint64_t sysdc_update_exited;
361 335 uint64_t sysdc_update_not_sdc;
362 336 uint64_t sysdc_update_idle;
363 337 uint64_t sysdc_update_take_break;
364 338 uint64_t sysdc_update_no_psets;
365 339 uint64_t sysdc_tick_not_sdc;
366 340 uint64_t sysdc_tick_quantum_expired;
367 341 uint64_t sysdc_thread_enter_enter;
368 342 } sysdc_stats;
369 343
370 344 #define SYSDC_INC_STAT(x) (sysdc_stats.x++)
371 345 #else
372 346 #define SYSDC_INC_STAT(x) ((void)0)
373 347 #endif
374 348
375 349 /* macros are UPPER CASE */
376 350 #define HOWMANY(a, b) howmany((a), (b))
377 351 #define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
378 352
379 353 static void
380 354 sysdc_initparam(void)
381 355 {
382 356 uint_t sysdc_break_ticks;
383 357
384 358 /* update / prune intervals */
385 359 sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
386 360
387 361 sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
388 362 sysdc_update_interval_msec);
389 363 sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
390 364 sysdc_update_interval_msec);
391 365
392 366 /* We must get at least a little time on CPU. */
393 367 sysdc_minDC = 1;
394 368 sysdc_maxDC = SYSDC_DC_MAX;
395 369 sysdc_minpri = 0;
396 370 sysdc_maxpri = maxclsyspri - 1;
397 371
398 372 /* break parameters */
399 373 if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
400 374 sysdc_max_pset_DC = SYSDC_DC_MAX;
401 375 }
402 376 sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
403 377 sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
404 378
405 379 /*
406 380 * We want:
407 381 *
408 382 * sysdc_max_pset_DC = (nobreak / (break + nobreak))
409 383 *
410 384 * ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
411 385 *
412 386 * sysdc_max_pset_DC * break
413 387 * ==> nobreak = -------------------------
414 388 * 1 - sysdc_max_pset_DC
415 389 */
416 390 sysdc_nobreak_updates =
417 391 HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
418 392 (SYSDC_DC_MAX - sysdc_max_pset_DC));
419 393
420 394 sysdc_param_init = 1;
421 395 }
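
To see what sysdc_initparam() produces in practice, assume the default tunables above, a 100 Hz system clock (usec_per_tick = 10000), and SYSDC_DC_MAX taken as the percentage scale of 100; the clock rate is platform-dependent, so these numbers are illustrative rather than guaranteed:

	sysdc_update_ticks    = howmany(20 * 1000, 10000)  = 2 ticks between updates
	sysdc_prune_updates   = howmany(100, 20)           = 5 updates asleep before pruning
	sysdc_reset_updates   = howmany(400, 20)           = 20 updates between base resets
	sysdc_break_ticks     = howmany(80 * 1000, 10000)  = 8 ticks
	sysdc_break_updates   = howmany(8, 2)              = 4 updates of "break"
	sysdc_nobreak_updates = howmany(4 * 90, 100 - 90)  = 36 updates of "nobreak"

Over a full break/nobreak cycle the pset is therefore unrestricted for 36 / (4 + 36) = 90% of updates, matching sysdc_max_pset_DC. As a further illustration, on a 4-CPU pset a 20ms interval supplies 80ms of CPU time; if max-priority SDC threads accumulated 76ms of ONPROC time in that interval, the pset duty cycle would be 95%, exceeding the 90% limit and starting a break.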
422 396
423 397 #undef HOWMANY
424 398 #undef MSECTOTICKS
425 399
426 400 #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
427 401 #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
428 402 #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
429 403
430 404 /*
431 405 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
432 406 * and Runnable times since the last reset.
433 407 *
434 408 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
435 409 * to track processor set usage.
436 410 */
437 411 static void
438 412 sysdc_update_times(sysdc_t *sdc, uint_t flags,
439 413 hrtime_t *O, hrtime_t *R, hrtime_t *newO)
440 414 {
441 415 kthread_t *const t = sdc->sdc_thread;
442 416 const uint_t initial = (flags & SDC_UPDATE_INITIAL);
443 417 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
444 418 const clock_t now = ddi_get_lbolt();
445 419 uint_t do_reset;
446 420
447 421 ASSERT(THREAD_LOCK_HELD(t));
448 422
449 423 *O = *R = 0;
450 424
451 425 /* If we've been sleeping, we know we haven't had any ONPROC time. */
452 426 if (sdc->sdc_sleep_updates != 0 &&
453 427 sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
454 428 *newO = sdc->sdc_last_base_O;
455 429 SYSDC_INC_STAT(sysdc_update_times_asleep);
456 430 return;
457 431 }
458 432
459 433 /*
460 434 * If this is our first update, or we've hit the reset point,
461 435 * we need to reset our base_{O,R}. Once we've updated them, we
462 436 * report O and R for the entire prior interval.
463 437 */
464 438 do_reset = initial;
465 439 if (update) {
466 440 ++sdc->sdc_nupdates;
467 441 if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
468 442 do_reset = 1;
469 443 }
470 444 if (do_reset) {
471 445 hrtime_t baseO, baseR;
472 446 if (initial) {
473 447 /*
474 448 * Start off our cycle count somewhere in the middle,
475 449 * to keep the resets from all happening at once.
476 450 *
477 451 * 4999 is a handy prime much larger than
478 452 * sysdc_reset_updates, so that we don't run into
479 453 * trouble if the resolution is a multiple of
480 454 * sysdc_reset_updates.
481 455 */
482 456 sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
483 457 sysdc_reset_updates);
484 458 baseO = baseR = 0;
485 459 } else {
486 460 baseO = sdc->sdc_base_O;
487 461 baseR = sdc->sdc_base_R;
488 462 }
489 463
490 464 mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
491 465 *newO = sdc->sdc_base_O;
492 466
493 467 sdc->sdc_reset = now;
494 468 sdc->sdc_pri_check = -1; /* force mismatch below */
495 469
496 470 /*
497 471 * See below for rationale.
498 472 */
499 473 if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
500 474 SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
501 475 baseO = sdc->sdc_base_O;
502 476 baseR = sdc->sdc_base_R;
503 477 }
504 478
505 479 /* compute based on the entire interval */
506 480 *O = (sdc->sdc_base_O - baseO);
507 481 *R = (sdc->sdc_base_R - baseR);
508 482 return;
509 483 }
510 484
511 485 /*
512 486 * If we're called from sysdc_update(), we *must* return a value
513 487 * for newO, so we always call mstate_systhread_times().
514 488 *
515 489 * Otherwise, if we've already done a pri check this tick,
516 490 * we can skip it.
517 491 */
518 492 if (!update && sdc->sdc_pri_check == now) {
519 493 SYSDC_INC_STAT(sysdc_update_times_already_done);
520 494 return;
521 495 }
522 496
523 497 /* Get the current times from the thread */
524 498 sdc->sdc_pri_check = now;
525 499 mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
526 500 *newO = sdc->sdc_cur_O;
527 501
528 502 /*
529 503 * The updating of microstate accounting is not done under a
530 504 * consistent set of locks, particularly the t_waitrq field. This
531 505 * can lead to narrow windows in which we account for time in the
532 506 * wrong bucket, which on the next read will be accounted for
533 507 * correctly.
534 508 *
535 509 * If our sdc_base_* fields were affected by one of these blips, we
536 510 * throw away the old data, and pretend this tick didn't happen.
537 511 */
538 512 if (sdc->sdc_cur_O < sdc->sdc_base_O ||
539 513 sdc->sdc_cur_R < sdc->sdc_base_R) {
540 514
541 515 sdc->sdc_base_O = sdc->sdc_cur_O;
542 516 sdc->sdc_base_R = sdc->sdc_cur_R;
543 517
544 518 SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
545 519 return;
546 520 }
547 521
548 522 *O = sdc->sdc_cur_O - sdc->sdc_base_O;
549 523 *R = sdc->sdc_cur_R - sdc->sdc_base_R;
550 524 }
551 525
552 526 /*
553 527 * sysdc_compute_pri()
554 528 *
555 529 * Recomputes the priority of the thread, leaving the result in
556 530 * sdc->sdc_epri. Returns 1 if a priority update should occur
557 531 * (which will also trigger a cpu_surrender()), otherwise
558 532 * returns 0.
559 533 */
560 534 static uint_t
561 535 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
562 536 {
563 537 kthread_t *const t = sdc->sdc_thread;
564 538 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
565 539 const uint_t tick = (flags & SDC_UPDATE_TICK);
566 540
567 541 hrtime_t O, R;
568 542 hrtime_t newO = -1;
569 543
570 544 ASSERT(THREAD_LOCK_HELD(t));
571 545
572 546 sysdc_update_times(sdc, flags, &O, &R, &newO);
573 547 ASSERT(!update || newO != -1);
574 548
575 549 /* If we have new data, recompute our priority. */
576 550 if ((O + R) != 0) {
577 551 sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
578 552
579 553 /* Adjust our priority to move our DC closer to the target. */
580 554 if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
581 555 sdc->sdc_pri = sdc->sdc_maxpri;
582 556 else
583 557 sdc->sdc_pri = sdc->sdc_minpri;
584 558 }
585 559
586 560 /*
587 561 * If our per-pset duty cycle goes over the max, we will take a break.
588 562 * This forces all sysdc threads in the pset to minimum priority, in
589 563 * order to let everyone else have a chance at the CPU.
590 564 */
591 565 if (sdc->sdc_pset->sdp_need_break) {
592 566 SYSDC_INC_STAT(sysdc_compute_pri_breaking);
593 567 sdc->sdc_epri = sdc->sdc_minpri;
594 568 } else {
595 569 sdc->sdc_epri = sdc->sdc_pri;
596 570 }
597 571
598 572 DTRACE_PROBE4(sysdc__compute__pri,
599 573 kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
600 574 uint_t, sdc->sdc_target_DC);
601 575
602 576 /*
603 577 * For sysdc_update(), we compute the ONPROC time for high-priority
604 578 * threads, which is used to calculate the per-pset duty cycle. We
605 579 * will always tell our callers to update the thread's priority,
606 580 * since we want to force a cpu_surrender().
607 581 *
608 582 * We reset sdc_update_ticks so that sysdc_tick() will only update
609 583 * the thread's priority if our timeout is delayed by a tick or
610 584 * more.
611 585 */
612 586 if (update) {
613 587 /* SDC threads are not allowed to change cpupart bindings. */
614 588 ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
615 589
616 590 /* If we were at MAXPRI, account for our onproc time. */
617 591 if (t->t_pri == sdc->sdc_maxpri &&
618 592 sdc->sdc_last_base_O != 0 &&
619 593 sdc->sdc_last_base_O < newO) {
620 594 sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
621 595 sdc->sdc_pset->sdp_onproc_time +=
622 596 (uint64_t)sdc->sdc_last_O;
623 597 sdc->sdc_pset->sdp_onproc_threads++;
624 598 } else {
625 599 sdc->sdc_last_O = 0;
626 600 }
627 601 sdc->sdc_last_base_O = newO;
628 602
629 603 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
630 604 return (1);
631 605 }
632 606
633 607 /*
634 608 * Like sysdc_update(), sysdc_tick() always wants to update the
635 609 * thread's priority, so that the CPU is surrendered if necessary.
636 610 * We reset sdc_update_ticks so that if the timeout continues to be
637 611 * delayed, we'll update at the regular interval.
638 612 */
639 613 if (tick) {
640 614 ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
641 615 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
642 616 return (1);
643 617 }
644 618
645 619 /*
646 620 * Otherwise, only tell our callers to update the priority if it has
647 621 * changed.
648 622 */
649 623 return (sdc->sdc_epri != t->t_pri);
650 624 }
651 625
652 626 static void
653 627 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
654 628 {
655 629 kthread_t *t = sdc->sdc_thread;
656 630
657 631 ASSERT(THREAD_LOCK_HELD(t));
658 632
659 633 if (sysdc_compute_pri(sdc, flags)) {
660 634 if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
661 635 cpu_surrender(t);
662 636 }
663 637 }
664 638 }
665 639
666 640 /*
667 641 * Add a thread onto the active list. It will only be removed by
668 642 * sysdc_update().
669 643 */
670 644 static void
671 645 sysdc_activate(sysdc_t *sdc)
672 646 {
673 647 sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
674 648 sysdc_t *head;
675 649 kthread_t *t = sdc->sdc_thread;
676 650
677 651 SYSDC_INC_STAT(sysdc_activate_enter);
678 652
679 653 ASSERT(sdc->sdc_next == NULL);
680 654 ASSERT(THREAD_LOCK_HELD(t));
681 655
682 656 do {
683 657 head = *headp;
684 658 sdc->sdc_next = head;
685 659 } while (atomic_cas_ptr(headp, head, sdc) != head);
686 660 }
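
The CAS loop in sysdc_activate() is the standard lock-free push onto a singly linked list. Below is a minimal standalone rendering of the same idiom, using C11 atomics in place of the kernel's atomic_cas_ptr(); the names here are illustrative, not kernel APIs.

#include <stdatomic.h>

struct node {
	struct node *next;
};

/* Push n in front of whatever head we last observed; retry if we lost a race. */
static void
lockfree_push(_Atomic(struct node *) *headp, struct node *n)
{
	struct node *head = atomic_load(headp);

	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(headp, &head, n));
}

int
main(void)
{
	static struct node dummy;		/* list terminator, like sysdc_dummy */
	_Atomic(struct node *) head = &dummy;
	struct node a, b;

	lockfree_push(&head, &a);		/* list: a -> dummy */
	lockfree_push(&head, &b);		/* list: b -> a -> dummy */
	return (0);
}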
687 661
688 662 /*
689 663 * sysdc_update() has two jobs:
690 664 *
691 665 * 1. It updates the priorities of all active SDC threads on the system.
692 666 * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
693 667 */
694 668 static void
695 669 sysdc_update(void *arg)
696 670 {
697 671 int idx;
698 672 sysdc_t *freelist = NULL;
699 673 sysdc_pset_t *cur;
700 674 hrtime_t now, diff;
701 675 uint_t redeploy = 1;
702 676
703 677 SYSDC_INC_STAT(sysdc_update_enter);
704 678
705 679 ASSERT(sysdc_update_timeout_started);
706 680
707 681 /*
708 682 * If this is our first time through, diff will be gigantic, and
709 683 * no breaks will be necessary.
710 684 */
711 685 now = gethrtime();
712 686 diff = now - sysdc_last_update;
713 687 sysdc_last_update = now;
714 688
715 689 mutex_enter(&sysdc_pset_lock);
716 690 for (cur = list_head(&sysdc_psets); cur != NULL;
717 691 cur = list_next(&sysdc_psets, cur)) {
718 692 boolean_t breaking = (cur->sdp_should_break != 0);
719 693
720 694 if (cur->sdp_need_break != breaking) {
721 695 DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
722 696 boolean_t, breaking);
723 697 }
724 698 cur->sdp_onproc_time = 0;
725 699 cur->sdp_onproc_threads = 0;
726 700 cur->sdp_need_break = breaking;
727 701 }
728 702 mutex_exit(&sysdc_pset_lock);
729 703
730 704 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
731 705 sysdc_list_t *sdl = &sysdc_active[idx];
732 706 sysdc_t *volatile *headp = &sdl->sdl_list;
733 707 sysdc_t *head, *tail;
734 708 sysdc_t **prevptr;
735 709
736 710 if (*headp == &sysdc_dummy)
737 711 continue;
738 712
739 713 /* Prevent any threads from exiting while we're poking them. */
740 714 mutex_enter(&sdl->sdl_lock);
741 715
742 716 /*
743 717 * Each sdl_list contains a singly-linked list of active
744 718 * threads. Threads which become active while we are
745 719 * processing the list will be added to sdl_list. Since we
746 720 * don't want that to interfere with our own processing, we
747 721 * swap in an empty list. Any newly active threads will
748 722 * go on to this empty list. When finished, we'll put any
749 723 * such threads at the end of the processed list.
750 724 */
751 725 head = atomic_swap_ptr(headp, &sysdc_dummy);
752 726 prevptr = &head;
753 727 while (*prevptr != &sysdc_dummy) {
754 728 sysdc_t *const sdc = *prevptr;
755 729 kthread_t *const t = sdc->sdc_thread;
756 730
757 731 /*
758 732 * If the thread has exited, move its sysdc_t onto
759 733 * freelist, to be freed later.
760 734 */
761 735 if (t == NULL) {
762 736 *prevptr = sdc->sdc_next;
763 737 SYSDC_INC_STAT(sysdc_update_exited);
764 738 sdc->sdc_next = freelist;
765 739 freelist = sdc;
766 740 continue;
767 741 }
768 742
769 743 thread_lock(t);
770 744 if (t->t_cid != sysdccid) {
771 745 thread_unlock(t);
772 746 prevptr = &sdc->sdc_next;
773 747 SYSDC_INC_STAT(sysdc_update_not_sdc);
774 748 continue;
775 749 }
776 750 ASSERT(t->t_cldata == sdc);
777 751
778 752 /*
779 753 * If the thread has been sleeping for longer
780 754 * than sysdc_prune_interval, make it inactive by
781 755 * removing it from the list.
782 756 */
783 757 if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
784 758 sdc->sdc_sleep_updates != 0 &&
785 759 (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
786 760 sysdc_prune_updates) {
787 761 *prevptr = sdc->sdc_next;
788 762 SYSDC_INC_STAT(sysdc_update_idle);
789 763 sdc->sdc_next = NULL;
790 764 thread_unlock(t);
791 765 continue;
792 766 }
793 767 sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
794 768 thread_unlock(t);
795 769
796 770 prevptr = &sdc->sdc_next;
797 771 }
798 772
799 773 /*
800 774 * Add our list to the bucket, putting any new entries
801 775 * added while we were working at the tail of the list.
802 776 */
803 777 do {
804 778 tail = *headp;
805 779 *prevptr = tail;
806 780 } while (atomic_cas_ptr(headp, tail, head) != tail);
807 781
808 782 mutex_exit(&sdl->sdl_lock);
809 783 }
810 784
811 785 mutex_enter(&sysdc_pset_lock);
812 786 for (cur = list_head(&sysdc_psets); cur != NULL;
813 787 cur = list_next(&sysdc_psets, cur)) {
814 788
815 789 cur->sdp_vtime_last_interval =
816 790 diff * cur->sdp_cpupart->cp_ncpus;
817 791 cur->sdp_DC_last_interval =
818 792 (cur->sdp_onproc_time * SYSDC_DC_MAX) /
819 793 cur->sdp_vtime_last_interval;
820 794
821 795 if (cur->sdp_should_break > 0) {
822 796 cur->sdp_should_break--; /* breaking */
823 797 continue;
824 798 }
825 799 if (cur->sdp_dont_break > 0) {
826 800 cur->sdp_dont_break--; /* waiting before checking */
827 801 continue;
828 802 }
829 803 if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
830 804 cur->sdp_should_break = sysdc_break_updates;
831 805 cur->sdp_dont_break = sysdc_nobreak_updates;
832 806 SYSDC_INC_STAT(sysdc_update_take_break);
833 807 }
834 808 }
835 809
836 810 /*
837 811 * If there are no sysdc_psets, there can be no threads, so
838 812 * we can stop doing our timeout. Since we're holding the
839 813 * sysdc_pset_lock, no new sysdc_psets can come in, which will
840 814 * prevent anyone from racing with this and dropping our timeout
841 815 * on the floor.
842 816 */
843 817 if (list_is_empty(&sysdc_psets)) {
844 818 SYSDC_INC_STAT(sysdc_update_no_psets);
845 819 ASSERT(sysdc_update_timeout_started);
846 820 sysdc_update_timeout_started = 0;
847 821
848 822 redeploy = 0;
849 823 }
850 824 mutex_exit(&sysdc_pset_lock);
851 825
852 826 while (freelist != NULL) {
853 827 sysdc_t *cur = freelist;
854 828 freelist = cur->sdc_next;
855 829 kmem_free(cur, sizeof (*cur));
856 830 }
857 831
858 832 if (redeploy) {
859 833 (void) timeout(sysdc_update, arg, sysdc_update_ticks);
860 834 }
861 835 }
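
sysdc_update()'s handling of each bucket (detach the whole list, process it privately, then splice it back in front of any newcomers) is the other half of the lock-free scheme started by sysdc_activate(). A standalone sketch of that idiom follows, again with C11 atomics standing in for atomic_swap_ptr()/atomic_cas_ptr(); all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static struct node sentinel;			/* plays the role of sysdc_dummy */
static _Atomic(struct node *) list_head = &sentinel;

static void
drain_and_splice(void)
{
	struct node *head, *tail, **prevptr;

	/* Swap in the sentinel; everything previously queued is now private to us. */
	head = atomic_exchange(&list_head, &sentinel);

	/* Walk the private list; prevptr ends up addressing its final link. */
	for (prevptr = &head; *prevptr != &sentinel; prevptr = &(*prevptr)->next)
		printf("processing node %d\n", (*prevptr)->id);

	/* Splice the processed list back in front of any nodes queued meanwhile. */
	do {
		tail = atomic_load(&list_head);
		*prevptr = tail;
	} while (!atomic_compare_exchange_weak(&list_head, &tail, head));
}

int
main(void)
{
	struct node a = { &sentinel, 1 }, b = { &a, 2 }, c = { &b, 3 };

	atomic_store(&list_head, &c);		/* list: c -> b -> a -> sentinel */
	drain_and_splice();
	return (0);
}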
862 836
863 837 static void
864 838 sysdc_preempt(kthread_t *t)
865 839 {
866 840 ASSERT(t == curthread);
867 841 ASSERT(THREAD_LOCK_HELD(t));
868 842
869 843 setbackdq(t); /* give others a chance to run */
870 844 }
871 845
872 846 static void
873 847 sysdc_tick(kthread_t *t)
874 848 {
875 849 sysdc_t *sdc;
876 850
877 851 thread_lock(t);
878 852 if (t->t_cid != sysdccid) {
879 853 SYSDC_INC_STAT(sysdc_tick_not_sdc);
880 854 thread_unlock(t);
881 855 return;
882 856 }
883 857 sdc = t->t_cldata;
884 858 if (t->t_state == TS_ONPROC &&
885 859 t->t_pri < t->t_disp_queue->disp_maxrunpri) {
886 860 cpu_surrender(t);
887 861 }
888 862
889 863 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
890 864 ASSERT(sdc->sdc_sleep_updates == 0);
891 865 }
892 866
893 867 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
894 868 sdc->sdc_ticks++;
895 869 if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
896 870 SYSDC_INC_STAT(sysdc_tick_quantum_expired);
897 871 sysdc_update_pri(sdc, SDC_UPDATE_TICK);
898 872 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
899 873 }
900 874 thread_unlock(t);
901 875 }
902 876
903 877 static void
904 878 sysdc_setrun(kthread_t *t)
905 879 {
906 880 sysdc_t *sdc = t->t_cldata;
907 881
908 882 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
909 883
910 884 sdc->sdc_sleep_updates = 0;
911 885
912 886 if (sdc->sdc_next == NULL) {
913 887 /*
914 888 * Since we're in transition, we don't want to use the
915 889 * full thread_update_pri().
916 890 */
917 891 if (sysdc_compute_pri(sdc, 0)) {
918 892 THREAD_CHANGE_PRI(t, sdc->sdc_epri);
919 893 }
920 894 sysdc_activate(sdc);
921 895
922 896 ASSERT(sdc->sdc_next != NULL);
923 897 }
924 898
925 899 setbackdq(t);
926 900 }
927 901
928 902 static void
929 903 sysdc_wakeup(kthread_t *t)
930 904 {
931 905 sysdc_setrun(t);
932 906 }
933 907
934 908 static void
935 909 sysdc_sleep(kthread_t *t)
936 910 {
937 911 sysdc_t *sdc = t->t_cldata;
938 912
939 913 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
940 914
941 915 sdc->sdc_sleep_updates = sdc->sdc_nupdates;
942 916 }
943 917
944 918 /*ARGSUSED*/
945 919 static int
946 920 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
947 921 void *bufp)
948 922 {
949 923 cpupart_t *const cpupart = t->t_cpupart;
950 924 sysdc_t *sdc = bufp;
951 925 sysdc_params_t *sdpp = parmsp;
952 926 sysdc_pset_t *newpset = sdc->sdc_pset;
953 927 sysdc_pset_t *pset;
954 928 int start_timeout;
955 929
956 930 if (t->t_cid != syscid)
957 931 return (EPERM);
958 932
959 933 ASSERT(ttolwp(t) != NULL);
960 934 ASSERT(sdpp != NULL);
961 935 ASSERT(newpset != NULL);
962 936 ASSERT(sysdc_param_init);
963 937
964 938 ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
965 939 ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
966 940 ASSERT(sdpp->sdp_DC >= sysdc_minDC);
967 941 ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
968 942
969 943 sdc->sdc_thread = t;
970 944 sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */
971 945 sdc->sdc_minpri = sdpp->sdp_minpri;
972 946 sdc->sdc_maxpri = sdpp->sdp_maxpri;
973 947 sdc->sdc_target_DC = sdpp->sdp_DC;
974 948 sdc->sdc_ticks = 0;
975 949 sdc->sdc_update_ticks = sysdc_update_ticks + 1;
976 950
977 951 /* Assign ourselves to the appropriate pset. */
978 952 sdc->sdc_pset = NULL;
979 953 mutex_enter(&sysdc_pset_lock);
980 954 for (pset = list_head(&sysdc_psets); pset != NULL;
981 955 pset = list_next(&sysdc_psets, pset)) {
982 956 if (pset->sdp_cpupart == cpupart) {
983 957 break;
984 958 }
985 959 }
986 960 if (pset == NULL) {
987 961 pset = newpset;
988 962 newpset = NULL;
989 963 pset->sdp_cpupart = cpupart;
990 964 list_insert_tail(&sysdc_psets, pset);
991 965 }
992 966 pset->sdp_nthreads++;
993 967 ASSERT(pset->sdp_nthreads > 0);
994 968
995 969 sdc->sdc_pset = pset;
996 970
997 971 start_timeout = (sysdc_update_timeout_started == 0);
998 972 sysdc_update_timeout_started = 1;
999 973 mutex_exit(&sysdc_pset_lock);
1000 974
1001 975 if (newpset != NULL)
1002 976 kmem_free(newpset, sizeof (*newpset));
1003 977
1004 978 /* Update t's scheduling class and priority. */
1005 979 thread_lock(t);
1006 980 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1007 981 t->t_cid = cid;
1008 982 t->t_cldata = sdc;
1009 983 t->t_schedflag |= TS_RUNQMATCH;
1010 984
1011 985 sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1012 986 thread_unlock(t);
1013 987
1014 988 /* Kick off the thread timeout if we're the first one in. */
1015 989 if (start_timeout) {
1016 990 (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1017 991 }
1018 992
1019 993 return (0);
1020 994 }
1021 995
1022 996 static void
1023 997 sysdc_leave(sysdc_t *sdc)
1024 998 {
1025 999 sysdc_pset_t *sdp = sdc->sdc_pset;
1026 1000 sysdc_list_t *sdl = SYSDC_LIST(sdc);
1027 1001 uint_t freedc;
1028 1002
1029 1003 mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */
1030 1004 sdc->sdc_thread = NULL;
1031 1005 freedc = (sdc->sdc_next == NULL);
1032 1006 mutex_exit(&sdl->sdl_lock);
1033 1007
1034 1008 mutex_enter(&sysdc_pset_lock);
1035 1009 ASSERT(sdp != NULL);
1036 1010 ASSERT(sdp->sdp_nthreads > 0);
1037 1011 --sdp->sdp_nthreads;
1038 1012 if (sdp->sdp_nthreads == 0) {
1039 1013 list_remove(&sysdc_psets, sdp);
1040 1014 } else {
1041 1015 sdp = NULL;
1042 1016 }
1043 1017 mutex_exit(&sysdc_pset_lock);
1044 1018
1045 1019 if (freedc)
1046 1020 kmem_free(sdc, sizeof (*sdc));
1047 1021 if (sdp != NULL)
1048 1022 kmem_free(sdp, sizeof (*sdp));
1049 1023 }
1050 1024
1051 1025 static void
1052 1026 sysdc_exitclass(void *buf)
1053 1027 {
1054 1028 sysdc_leave((sysdc_t *)buf);
1055 1029 }
1056 1030
1057 1031 /*ARGSUSED*/
1058 1032 static int
1059 1033 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1060 1034 {
1061 1035 /* Threads cannot exit SDC once joined, except in a body bag. */
1062 1036 return (EPERM);
1063 1037 }
1064 1038
1065 1039 static void
1066 1040 sysdc_exit(kthread_t *t)
1067 1041 {
1068 1042 sysdc_t *sdc;
1069 1043
1070 1044 /* We're exiting, so we just rejoin the SYS class. */
1071 1045 thread_lock(t);
1072 1046 ASSERT(t->t_cid == sysdccid);
1073 1047 sdc = t->t_cldata;
1074 1048 t->t_cid = syscid;
1075 1049 t->t_cldata = NULL;
1076 1050 t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1077 1051 (void) thread_change_pri(t, maxclsyspri, 0);
1078 1052 t->t_schedflag &= ~TS_RUNQMATCH;
1079 1053 thread_unlock_nopreempt(t);
1080 1054
1081 1055 /* Unlink the sdc from everything. */
1082 1056 sysdc_leave(sdc);
1083 1057 }
1084 1058
1085 1059 /*ARGSUSED*/
1086 1060 static int
1087 1061 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1088 1062 {
1089 1063 /*
1090 1064 * Threads cannot be created with SDC as their class; they must
1091 1065 * be created as SYS and then added with sysdc_thread_enter().
1092 1066 * Because of this restriction, sysdc_fork() should never be called.
1093 1067 */
1094 1068 panic("sysdc cannot be forked");
1095 1069
1096 1070 return (ENOSYS);
1097 1071 }
1098 1072
1099 1073 /*ARGSUSED*/
1100 1074 static void
1101 1075 sysdc_forkret(kthread_t *t, kthread_t *ct)
1102 1076 {
1103 1077 /* SDC threads are part of system processes, which never fork. */
1104 1078 panic("sysdc cannot be forked");
1105 1079 }
1106 1080
1107 1081 static pri_t
1108 1082 sysdc_globpri(kthread_t *t)
1109 1083 {
1110 1084 return (t->t_epri);
1111 1085 }
1112 1086
1113 1087 /*ARGSUSED*/
1114 1088 static pri_t
1115 1089 sysdc_no_swap(kthread_t *t, int flags)
1116 1090 {
1117 1091 /* SDC threads cannot be swapped. */
1118 1092 return (-1);
1119 1093 }
1120 1094
1121 1095 /*
1122 1096 * Get maximum and minimum priorities enjoyed by SDC threads.
1123 1097 */
1124 1098 static int
1125 1099 sysdc_getclpri(pcpri_t *pcprip)
1126 1100 {
1127 1101 pcprip->pc_clpmax = sysdc_maxpri;
1128 1102 pcprip->pc_clpmin = sysdc_minpri;
1129 1103 return (0);
1130 1104 }
1131 1105
1132 1106 /*ARGSUSED*/
1133 1107 static int
1134 1108 sysdc_getclinfo(void *arg)
1135 1109 {
1136 1110 return (0); /* no class-specific info */
1137 1111 }
1138 1112
1139 1113 /*ARGSUSED*/
1140 1114 static int
1141 1115 sysdc_alloc(void **p, int flag)
1142 1116 {
1143 1117 sysdc_t *new;
1144 1118
1145 1119 *p = NULL;
1146 1120 if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1147 1121 return (ENOMEM);
1148 1122 }
1149 1123 if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1150 1124 NULL) {
1151 1125 kmem_free(new, sizeof (*new));
1152 1126 return (ENOMEM);
1153 1127 }
1154 1128 *p = new;
1155 1129 return (0);
1156 1130 }
1157 1131
1158 1132 static void
1159 1133 sysdc_free(void *p)
1160 1134 {
1161 1135 sysdc_t *sdc = p;
1162 1136
1163 1137 if (sdc != NULL) {
1164 1138 /*
1165 1139 * We must have failed CL_ENTERCLASS(), so our pset should be
1166 1140 * there and unused.
1167 1141 */
1168 1142 ASSERT(sdc->sdc_pset != NULL);
1169 1143 ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1170 1144 kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1171 1145 kmem_free(sdc, sizeof (*sdc));
1172 1146 }
1173 1147 }
1174 1148
1175 1149 static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
1176 1150 static int sysdc_einval();
1177 1151 static void sysdc_nullsys();
1178 1152
1179 1153 static struct classfuncs sysdc_classfuncs = {
1180 1154 /* messages to class manager */
1181 1155 {
1182 1156 sysdc_enosys, /* admin */
1183 1157 sysdc_getclinfo,
1184 1158 sysdc_enosys, /* parmsin */
1185 1159 sysdc_enosys, /* parmsout */
1186 1160 sysdc_enosys, /* vaparmsin */
1187 1161 sysdc_enosys, /* vaparmsout */
1188 1162 sysdc_getclpri,
1189 1163 sysdc_alloc,
1190 1164 sysdc_free,
1191 1165 },
1192 1166 /* operations on threads */
1193 1167 {
1194 1168 sysdc_enterclass,
1195 1169 sysdc_exitclass,
1196 1170 sysdc_canexit,
1197 1171 sysdc_fork,
1198 1172 sysdc_forkret,
1199 1173 sysdc_nullsys, /* parmsget */
1200 1174 sysdc_enosys, /* parmsset */
1201 1175 sysdc_nullsys, /* stop */
1202 1176 sysdc_exit,
1203 1177 sysdc_nullsys, /* active */
1204 1178 sysdc_nullsys, /* inactive */
1205 1179 sysdc_no_swap, /* swapin */
1206 1180 sysdc_no_swap, /* swapout */
1207 1181 sysdc_nullsys, /* trapret */
1208 1182 sysdc_preempt,
1209 1183 sysdc_setrun,
1210 1184 sysdc_sleep,
1211 1185 sysdc_tick,
1212 1186 sysdc_wakeup,
1213 1187 sysdc_einval, /* donice */
1214 1188 sysdc_globpri,
1215 1189 sysdc_nullsys, /* set_process_group */
1216 1190 sysdc_nullsys, /* yield */
1217 1191 sysdc_einval, /* doprio */
1218 1192 }
1219 1193 };
1220 1194
1221 1195 static int
1222 1196 sysdc_enosys()
1223 1197 {
1224 1198 return (ENOSYS);
1225 1199 }
1226 1200
1227 1201 static int
1228 1202 sysdc_einval()
1229 1203 {
1230 1204 return (EINVAL);
1231 1205 }
1232 1206
1233 1207 static void
1234 1208 sysdc_nullsys()
1235 1209 {
1236 1210 }
1237 1211
1238 1212 /*ARGSUSED*/
1239 1213 static pri_t
1240 1214 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1241 1215 {
1242 1216 int idx;
1243 1217
1244 1218 list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1245 1219 offsetof(sysdc_pset_t, sdp_node));
1246 1220
1247 1221 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1248 1222 sysdc_active[idx].sdl_list = &sysdc_dummy;
1249 1223 }
1250 1224
1251 1225 sysdc_initparam();
1252 1226
1253 1227 sysdccid = cid;
1254 1228 *clfuncspp = &sysdc_classfuncs;
1255 1229
1256 1230 return ((pri_t)v.v_maxsyspri);
1257 1231 }
1258 1232
1259 1233 static struct sclass csw = {
1260 1234 "SDC",
1261 1235 sysdc_init,
1262 1236 0
1263 1237 };
1264 1238
1265 1239 static struct modlsched modlsched = {
1266 1240 &mod_schedops, "system duty cycle scheduling class", &csw
1267 1241 };
1268 1242
1269 1243 static struct modlinkage modlinkage = {
1270 1244 MODREV_1, (void *)&modlsched, NULL
1271 1245 };
1272 1246
1273 1247 int
1274 1248 _init()
1275 1249 {
1276 1250 return (mod_install(&modlinkage));
1277 1251 }
1278 1252
1279 1253 int
1280 1254 _fini()
1281 1255 {
1282 1256 return (EBUSY); /* can't unload for now */
1283 1257 }
1284 1258
1285 1259 int
1286 1260 _info(struct modinfo *modinfop)
1287 1261 {
1288 1262 return (mod_info(&modlinkage, modinfop));
1289 1263 }
1290 1264
1291 1265 /* --- consolidation-private interfaces --- */
1292 1266 void
1293 1267 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1294 1268 {
1295 1269 void *buf = NULL;
1296 1270 sysdc_params_t sdp;
1297 1271
1298 1272 SYSDC_INC_STAT(sysdc_thread_enter_enter);
1299 1273
1300 1274 ASSERT(sysdc_param_init);
1301 1275 ASSERT(sysdccid >= 0);
1302 1276
1303 1277 ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1304 1278
1305 1279 sdp.sdp_minpri = sysdc_minpri;
1306 1280 sdp.sdp_maxpri = sysdc_maxpri;
1307 1281 sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1308 1282
1309 1283 VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1310 1284
1311 1285 ASSERT(t->t_lwp != NULL);
1312 1286 ASSERT(t->t_cid == syscid);
1313 1287 ASSERT(t->t_cldata == NULL);
1314 1288 VERIFY0(CL_CANEXIT(t, NULL));
1315 1289 VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
1316 1290 CL_EXITCLASS(syscid, NULL);
1317 1291 }
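
For completeness, here is a hypothetical caller of the interface above, loosely modeled on how a taskq-style consumer would adopt SDC. Apart from lwp_kernel_create(), lwptot(), and sysdc_thread_enter() themselves, the names are assumptions; the sketch also assumes it runs in an SSYS system process, and it leaves out making the new thread runnable.

/*
 * Hypothetical consumer (illustration only).  Assumes the calling thread
 * belongs to an SSYS system process, as SDC membership requires.
 */
static kthread_t *
example_sdc_worker_create(void (*worker)(void *), void *arg, uint_t dc)
{
	kthread_t *t;

	/*
	 * The worker needs an LWP (for microstate accounting), so it is
	 * created with lwp_kernel_create() rather than thread_create().
	 */
	t = lwptot(lwp_kernel_create(curproc, worker, arg, TS_STOPPED,
	    minclsyspri));

	/* Move it from the SYS class into SDC with the requested duty cycle. */
	sysdc_thread_enter(t, dc, 0);

	/* (Making the thread runnable is left out of this sketch.) */
	return (t);
}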