3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
--- old/usr/src/uts/common/disp/sysdc.c
+++ new/usr/src/uts/common/disp/sysdc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright (c) 2012 by Delphix. All rights reserved.
23 24 */
24 25
25 26 /*
26 27 * The System Duty Cycle (SDC) scheduling class
27 28 * --------------------------------------------
28 29 *
29 30 * Background
30 31 *
31 32 * Kernel threads in Solaris have traditionally not been large consumers
32 33 * of CPU time. They typically wake up, perform a small amount of
33 34 * work, then go back to sleep waiting for either a timeout or another
34 35 * signal. On the assumption that the small amount of work that they do
35 36 * is important for the behavior of the whole system, these threads are
36 37 * treated kindly by the dispatcher and the SYS scheduling class: they run
37 38 * without preemption from anything other than real-time and interrupt
38 39 * threads; when preempted, they are put at the front of the queue, so they
39 40 * generally do not migrate between CPUs; and they are allowed to stay
40 41 * running until they voluntarily give up the CPU.
41 42 *
42 43 * As Solaris has evolved, new workloads have emerged which require the
43 44 * kernel to perform significant amounts of CPU-intensive work. One
44 45 * example of such a workload is ZFS's transaction group sync processing.
45 46 * Each sync operation generates a large batch of I/Os, and each I/O
46 47 * may need to be compressed and/or checksummed before it is written to
47 48 * storage. The taskq threads which perform the compression and checksums
48 49 * will run nonstop as long as they have work to do; a large sync operation
49 50 * on a compression-heavy dataset can keep them busy for seconds on end.
50 51 * This causes human-time-scale dispatch latency bubbles for any other
51 52 * threads which have the misfortune to share a CPU with the taskq threads.
52 53 *
53 54 * The SDC scheduling class is a solution to this problem.
54 55 *
55 56 *
56 57 * Overview
57 58 *
58 59 * SDC is centered around the concept of a thread's duty cycle (DC):
59 60 *
60 61 * ONPROC time
61 62 * Duty Cycle = ----------------------
62 63 * ONPROC + Runnable time
63 64 *
64 65 * This is the ratio of the time that the thread spent running on a CPU
65 66 * divided by the time it spent running or trying to run. It is unaffected
66 67 * by any time the thread spent sleeping, stopped, etc.
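 *
 * As an illustrative worked example (the numbers are arbitrary, not taken
 * from any real workload): a thread which, since its base times were last
 * reset, has accumulated 30ms of ONPROC time and 10ms of Runnable time has
 * a duty cycle of 30 / (30 + 10) = 75%.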
67 68 *
68 69 * A thread joining the SDC class specifies a "target" DC that it wants
69 70 * to run at. To implement this policy, the routine sysdc_update() scans
70 71 * the list of active SDC threads every few ticks and uses each thread's
71 72 * microstate data to compute the actual duty cycle that that thread
72 73 * has experienced recently. If the thread is under its target DC, its
73 74 * priority is increased to the maximum available (sysdc_maxpri, which is
74 75 * 99 by default). If the thread is over its target DC, its priority is
75 76 * reduced to the minimum available (sysdc_minpri, 0 by default). This
76 77 * is a fairly primitive approach, in that it doesn't use any of the
77 78 * intermediate priorities, but it's not completely inappropriate. Even
78 79 * though threads in the SDC class might take a while to do their job, they
79 80 * are by some definition important if they're running inside the kernel,
80 81 * so it is reasonable that they should get to run at priority 99.
81 82 *
82 83 * If a thread is running when sysdc_update() calculates its actual duty
83 84 * cycle, and there are other threads of equal or greater priority on its
84 85 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
85 86 * acknowledges the preemption by calling sysdc_preempt(), which calls
86 87 * setbackdq(), which gives other threads with the same priority a chance
87 88 * to run. This creates a de facto time quantum for threads in the SDC
88 89 * scheduling class.
89 90 *
90 91 * An SDC thread which is assigned priority 0 can continue to run if
91 92 * nothing else needs to use the CPU that it's running on. Similarly, an
92 93 * SDC thread at priority 99 might not get to run as much as it wants to
93 94 * if there are other priority-99 or higher threads on its CPU. These
94 95 * situations would cause the thread to get ahead of or behind its target
95 96 * DC; the longer the situations lasted, the further ahead or behind the
96 97 * thread would get. Rather than condemning a thread to a lifetime of
97 98 * paying for its youthful indiscretions, SDC keeps "base" values for
98 99 * ONPROC and Runnable times in each thread's sysdc data, and updates these
99 100 * values periodically. The duty cycle is then computed using the elapsed
100 101 * amount of ONPROC and Runnable times since those base times.
101 102 *
102 103 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
103 104 * keep the list of "active" threads small by pruning out threads which
104 105 * have been asleep for a brief time. They are not pruned immediately upon
105 106 * going to sleep, since some threads may bounce back and forth between
106 107 * sleeping and being runnable.
107 108 *
108 109 *
109 110 * Interfaces
110 111 *
111 112 * void sysdc_thread_enter(t, dc, flags)
112 113 *
113 114 * Moves a kernel thread from the SYS scheduling class to the
114 115 * SDC class. t must have an associated LWP (created by calling
115 116 * lwp_kernel_create()). The thread will have a target DC of dc.
116 117 * Flags should be either 0 or SYSDC_THREAD_BATCH. If
117 118 * SYSDC_THREAD_BATCH is specified, the thread is expected to be
118 119 * doing large amounts of processing.
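 *
 *	A minimal usage sketch (hypothetical caller: the worker thread, the
 *	target DC of 80, and the elided lwp_kernel_create() arguments are
 *	illustrative only):
 *
 *		klwp_t *lwp = lwp_kernel_create(...);
 *		sysdc_thread_enter(lwptot(lwp), 80, 0);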
119 120 *
120 121 *
121 122 * Complications
122 123 *
123 124 * - Run queue balancing
124 125 *
125 126 * The Solaris dispatcher is biased towards letting a thread run
126 127 * on the same CPU which it last ran on, if no more than 3 ticks
127 128 * (i.e. rechoose_interval) have passed since the thread last ran.
128 129 * This helps to preserve cache warmth. On the other hand, it also
129 130 * tries to keep the per-CPU run queues fairly balanced; if the CPU
130 131 * chosen for a runnable thread has a run queue which is three or
131 132 * more threads longer than a neighboring CPU's queue, the runnable
132 133 * thread is dispatched onto the neighboring CPU instead.
133 134 *
134 135 * These policies work well for some workloads, but not for many SDC
135 136 * threads. The taskq client of SDC, for example, has many discrete
136 137 * units of work to do. The work units are largely independent, so
137 138 * cache warmth is not an important consideration. It is important
138 139 * that the threads fan out quickly to different CPUs, since the
139 140 * amount of work these threads have to do (a few seconds worth at a
140 141 * time) doesn't leave much time to correct thread placement errors
141 142 * (i.e. two SDC threads being dispatched to the same CPU).
142 143 *
143 144 * To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
144 145 * This tells the dispatcher to keep neighboring run queues' lengths
145 146 * more evenly matched, which allows SDC threads to migrate more
146 147 * easily.
147 148 *
148 149 * - LWPs and system processes
149 150 *
150 151 * SDC can only be used for kernel threads. Since SDC uses microstate
151 152 * accounting data to compute each thread's actual duty cycle, all
152 153 * threads entering the SDC class must have associated LWPs (which
153 154 * store the microstate data). This means that the threads have to
154 155 * be associated with an SSYS process, i.e. one created by newproc().
155 156 * If the microstate accounting information is ever moved into the
156 157 * kthread_t, this restriction could be lifted.
157 158 *
158 159 * - Dealing with oversubscription
159 160 *
160 161 * Since SDC duty cycles are per-thread, it is possible that the
161 162 * aggregate requested duty cycle of all SDC threads in a processor
162 163 * set could be greater than the total CPU time available in that set.
163 164 * The FSS scheduling class has an analogous situation, which it deals
164 165 * with by reducing each thread's allotted CPU time proportionally.
165 166 * Since SDC doesn't need to be as precise as FSS, it uses a simpler
166 167 * solution to the oversubscription problem.
167 168 *
168 169 * sysdc_update() accumulates the amount of time that max-priority SDC
169 170 * threads have spent on-CPU in each processor set, and uses that sum
170 171 * to create an implied duty cycle for that processor set:
171 172 *
172 173 * accumulated CPU time
173 174 * pset DC = -----------------------------------
174 175 * (# CPUs) * time since last update
175 176 *
176 177 * If this implied duty cycle is above a maximum pset duty cycle (90%
177 178 * by default), sysdc_update() sets the priority of all SDC threads
178 179 * in that processor set to sysdc_minpri for a "break" period. After
179 180 * the break period, it waits for a "nobreak" period before trying to
180 181 * enforce the pset duty cycle limit again.
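 *
 *	As a worked example (arbitrary numbers): in a 4-CPU pset with a
 *	20ms update interval, (# CPUs) * time since last update is
 *	4 * 20ms = 80ms.  If max-priority SDC threads accumulated 75ms of
 *	ONPROC time over that interval, the implied pset DC is 75/80, or
 *	roughly 94%, which exceeds the default 90% limit, so a break
 *	would be taken.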
181 182 *
182 183 * - Processor sets
183 184 *
184 185 * As the above implies, SDC is processor set aware, but it does not
185 186 * currently allow threads to change processor sets while in the SDC
186 187 * class. Instead, those threads must join the desired processor set
187 188 * before entering SDC. [1]
188 189 *
189 190 * - Batch threads
190 191 *
191 192 * A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
192 193 * flag. This flag currently has no effect, but marks threads which
193 194 * do bulk processing.
194 195 *
195 196 * - t_kpri_req
196 197 *
197 198 * The TS and FSS scheduling classes pay attention to t_kpri_req,
198 199 * which provides a simple form of priority inheritance for
199 200 * synchronization primitives (such as rwlocks held as READER) which
200 201 * cannot be traced to a unique thread. The SDC class does not honor
201 202 * t_kpri_req, for a few reasons:
202 203 *
203 204 * 1. t_kpri_req is notoriously inaccurate. A measure of its
204 205 * inaccuracy is that it needs to be cleared every time a thread
205 206 * returns to user mode, because it is frequently non-zero at that
206 207 * point. This can happen because "ownership" of synchronization
207 208 * primitives that use t_kpri_req can be silently handed off,
208 209 * leaving no opportunity to will the t_kpri_req inheritance.
209 210 *
210 211 * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
211 212 * kernel priority. This means that even if an SDC thread
212 213 * is holding a synchronization primitive and running at low
213 214 * priority, its priority will eventually be raised above 60,
214 215 * allowing it to drive on and release the resource.
215 216 *
216 217 * 3. The first consumer of SDC uses the taskq subsystem, which holds
217 218 * a reader lock for the duration of the task's execution. This
218 219 * would mean that SDC threads would never drop below kernel
219 220 * priority in practice, which defeats one of the purposes of SDC.
220 221 *
221 222 * - Why not FSS?
222 223 *
223 224 * It might seem that the existing FSS scheduling class could solve
224 225 * the problems that SDC is attempting to solve. FSS's more precise
225 226 * solution to the oversubscription problem would hardly cause
226 227 * trouble, as long as it performed well. SDC is implemented as
227 228 * a separate scheduling class for two main reasons: the initial
228 229 * consumer of SDC does not map well onto the "project" abstraction
229 230 * that is central to FSS, and FSS does not expect to run at kernel
230 231 * priorities.
231 232 *
232 233 *
233 234 * Tunables
234 235 *
235 236 * - sysdc_update_interval_msec: Number of milliseconds between
236 237 * consecutive thread priority updates.
237 238 *
238 239 * - sysdc_reset_interval_msec: Number of milliseconds between
239 240 * consecutive resets of a thread's base ONPROC and Runnable
240 241 * times.
241 242 *
242 243 * - sysdc_prune_interval_msec: Number of milliseconds of sleeping
243 244 * before a thread is pruned from the active list.
244 245 *
245 246 * - sysdc_max_pset_DC: Allowable percentage of a processor set's
246 247 * CPU time which SDC can give to its high-priority threads.
247 248 *
248 249 * - sysdc_break_msec: Number of milliseconds of "break" taken when
249 250 * sysdc_max_pset_DC is exceeded.
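 *
 *	These are ordinary kernel variables, so they can be tuned at boot
 *	time in the usual way; for example (assuming the scheduling-class
 *	module is delivered under the name SDC), a line like the following
 *	in /etc/system would halve the update interval:
 *
 *		set SDC:sysdc_update_interval_msec = 10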
250 251 *
251 252 *
252 253 * Future work (in SDC and related subsystems)
253 254 *
254 255 * - Per-thread rechoose interval (0 for SDC)
255 256 *
256 257 * Allow each thread to specify its own rechoose interval. SDC
257 258 * threads would specify an interval of zero, which would rechoose
258 259 * the CPU with the lowest priority once per update.
259 260 *
260 261 * - Allow threads to change processor sets after joining the SDC class
261 262 *
262 263 * - Thread groups and per-group DC
263 264 *
264 265 * It might be nice to be able to specify a duty cycle which applies
265 266 * to a group of threads in aggregate.
266 267 *
267 268 * - Per-group DC callback to allow dynamic DC tuning
268 269 *
269 270 * Currently, DCs are assigned when the thread joins SDC. Some
270 271 * workloads could benefit from being able to tune their DC using
271 272 * subsystem-specific knowledge about the workload.
272 273 *
273 274 * - Finer-grained priority updates
274 275 *
275 276 * - More nuanced management of oversubscription
276 277 *
277 278 * - Moving other CPU-intensive threads into SDC
278 279 *
279 280 * - Move msacct data into kthread_t
280 281 *
281 282 * This would allow kernel threads without LWPs to join SDC.
282 283 *
283 284 *
284 285 * Footnotes
285 286 *
286 287 * [1] The details of doing so are left as an exercise for the reader.
287 288 */
288 289
289 290 #include <sys/types.h>
290 291 #include <sys/sysdc.h>
291 292 #include <sys/sysdc_impl.h>
292 293
293 294 #include <sys/class.h>
294 295 #include <sys/cmn_err.h>
295 296 #include <sys/cpuvar.h>
296 297 #include <sys/cpupart.h>
297 298 #include <sys/debug.h>
298 299 #include <sys/disp.h>
299 300 #include <sys/errno.h>
300 301 #include <sys/inline.h>
301 302 #include <sys/kmem.h>
302 303 #include <sys/modctl.h>
303 304 #include <sys/schedctl.h>
304 305 #include <sys/sdt.h>
305 306 #include <sys/sunddi.h>
306 307 #include <sys/sysmacros.h>
307 308 #include <sys/systm.h>
308 309 #include <sys/var.h>
309 310
310 311 /*
311 312 * Tunables - loaded into the internal state at module load time
312 313 */
313 314 uint_t sysdc_update_interval_msec = 20;
314 315 uint_t sysdc_reset_interval_msec = 400;
315 316 uint_t sysdc_prune_interval_msec = 100;
316 317 uint_t sysdc_max_pset_DC = 90;
317 318 uint_t sysdc_break_msec = 80;
318 319
319 320 /*
320 321 * Internal state - constants set up by sysdc_initparam()
321 322 */
322 323 static clock_t sysdc_update_ticks; /* ticks between updates */
323 324 static uint_t sysdc_prune_updates; /* updates asleep before pruning */
324 325 static uint_t sysdc_reset_updates; /* # of updates before reset */
325 326 static uint_t sysdc_break_updates; /* updates to break */
326 327 static uint_t sysdc_nobreak_updates; /* updates to not check */
327 328 static uint_t sysdc_minDC; /* minimum allowed DC */
328 329 static uint_t sysdc_maxDC; /* maximum allowed DC */
329 330 static pri_t sysdc_minpri; /* minimum allowed priority */
330 331 static pri_t sysdc_maxpri; /* maximum allowed priority */
331 332
332 333 /*
333 334 * Internal state
334 335 */
335 336 static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
336 337 static list_t sysdc_psets; /* list of psets with SDC threads */
337 338 static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
338 339 static uint_t sysdc_update_timeout_started; /* update timeout is active */
339 340 static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
340 341 static sysdc_t sysdc_dummy; /* used to terminate active lists */
341 342
342 343 /*
343 344 * Internal state - active hash table
344 345 */
345 346 #define SYSDC_NLISTS 8
346 347 #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
347 348 static sysdc_list_t sysdc_active[SYSDC_NLISTS];
348 349 #define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
349 350
350 351 #ifdef DEBUG
351 352 static struct {
352 353 uint64_t sysdc_update_times_asleep;
353 354 uint64_t sysdc_update_times_base_ran_backwards;
354 355 uint64_t sysdc_update_times_already_done;
355 356 uint64_t sysdc_update_times_cur_ran_backwards;
356 357 uint64_t sysdc_compute_pri_breaking;
357 358 uint64_t sysdc_activate_enter;
358 359 uint64_t sysdc_update_enter;
359 360 uint64_t sysdc_update_exited;
360 361 uint64_t sysdc_update_not_sdc;
361 362 uint64_t sysdc_update_idle;
362 363 uint64_t sysdc_update_take_break;
363 364 uint64_t sysdc_update_no_psets;
364 365 uint64_t sysdc_tick_not_sdc;
365 366 uint64_t sysdc_tick_quantum_expired;
366 367 uint64_t sysdc_thread_enter_enter;
367 368 } sysdc_stats;
368 369
369 370 #define SYSDC_INC_STAT(x) (sysdc_stats.x++)
370 371 #else
371 372 #define SYSDC_INC_STAT(x) ((void)0)
372 373 #endif
373 374
374 375 /* macros are UPPER CASE */
375 376 #define HOWMANY(a, b) howmany((a), (b))
376 377 #define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
377 378
378 379 static void
379 380 sysdc_initparam(void)
380 381 {
381 382 uint_t sysdc_break_ticks;
382 383
383 384 /* update / prune intervals */
384 385 sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
385 386
386 387 sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
387 388 sysdc_update_interval_msec);
388 389 sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
389 390 sysdc_update_interval_msec);
390 391
391 392 /* We must get at least a little time on CPU. */
392 393 sysdc_minDC = 1;
393 394 sysdc_maxDC = SYSDC_DC_MAX;
394 395 sysdc_minpri = 0;
395 396 sysdc_maxpri = maxclsyspri;
396 397
397 398 /* break parameters */
398 399 if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
399 400 sysdc_max_pset_DC = SYSDC_DC_MAX;
400 401 }
401 402 sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
402 403 sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
403 404
404 405 /*
405 406 * We want:
406 407 *
407 408 * sysdc_max_pset_DC = (nobreak / (break + nobreak))
408 409 *
409 410 * ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
410 411 *
411 412 * sysdc_max_pset_DC * break
412 413 * ==> nobreak = -------------------------
413 414 * 1 - sysdc_max_pset_DC
414 415 */
415 416 sysdc_nobreak_updates =
416 417 HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
417 418 (SYSDC_DC_MAX - sysdc_max_pset_DC));
418 419
419 420 sysdc_param_init = 1;
420 421 }
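/*
 * A worked example of the default tunables above (assuming a 100Hz clock,
 * i.e. usec_per_tick == 10000, and SYSDC_DC_MAX == 100):
 *
 *	sysdc_update_ticks    = howmany(20 * 1000, 10000)   = 2 ticks
 *	sysdc_prune_updates   = howmany(100, 20)             = 5 updates
 *	sysdc_reset_updates   = howmany(400, 20)             = 20 updates
 *	sysdc_break_updates   = howmany(MSECTOTICKS(80), 2)  = 4 updates
 *	sysdc_nobreak_updates = howmany(4 * 90, 100 - 90)    = 36 updates
 *
 * That is, priorities are recomputed every 20ms, threads are pruned after
 * roughly 100ms asleep, base times are reset about every 400ms, and a
 * "break" lasts about 80ms followed by at least ~720ms of not breaking,
 * so breaks consume at most 4 / (4 + 36) = 10% of the time, consistent
 * with sysdc_max_pset_DC = 90.
 */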
421 422
422 423 #undef HOWMANY
423 424 #undef MSECTOTICKS
424 425
425 426 #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
426 427 #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
427 428 #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
428 429
429 430 /*
430 431 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
431 432 * and Runnable times since the last reset.
432 433 *
433 434 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
434 435 * to track processor set usage.
435 436 */
436 437 static void
437 438 sysdc_update_times(sysdc_t *sdc, uint_t flags,
438 439 hrtime_t *O, hrtime_t *R, hrtime_t *newO)
439 440 {
440 441 kthread_t *const t = sdc->sdc_thread;
441 442 const uint_t initial = (flags & SDC_UPDATE_INITIAL);
442 443 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
443 444 const clock_t now = ddi_get_lbolt();
444 445 uint_t do_reset;
445 446
446 447 ASSERT(THREAD_LOCK_HELD(t));
447 448
448 449 *O = *R = 0;
449 450
450 451 /* If we've been sleeping, we know we haven't had any ONPROC time. */
451 452 if (sdc->sdc_sleep_updates != 0 &&
452 453 sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
453 454 *newO = sdc->sdc_last_base_O;
454 455 SYSDC_INC_STAT(sysdc_update_times_asleep);
455 456 return;
456 457 }
457 458
458 459 /*
459 460 * If this is our first update, or we've hit the reset point,
460 461 * we need to reset our base_{O,R}. Once we've updated them, we
461 462 * report O and R for the entire prior interval.
462 463 */
463 464 do_reset = initial;
464 465 if (update) {
465 466 ++sdc->sdc_nupdates;
466 467 if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
467 468 do_reset = 1;
468 469 }
469 470 if (do_reset) {
470 471 hrtime_t baseO, baseR;
471 472 if (initial) {
472 473 /*
473 474 * Start off our cycle count somewhere in the middle,
474 475 * to keep the resets from all happening at once.
475 476 *
476 477 * 4999 is a handy prime much larger than
477 478 * sysdc_reset_updates, so that we don't run into
478 479 * trouble if the resolution is a multiple of
479 480 * sysdc_reset_updates.
480 481 */
481 482 sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
482 483 sysdc_reset_updates);
483 484 baseO = baseR = 0;
484 485 } else {
485 486 baseO = sdc->sdc_base_O;
486 487 baseR = sdc->sdc_base_R;
487 488 }
488 489
489 490 mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
490 491 *newO = sdc->sdc_base_O;
491 492
492 493 sdc->sdc_reset = now;
493 494 sdc->sdc_pri_check = -1; /* force mismatch below */
494 495
495 496 /*
496 497 * See below for rationale.
497 498 */
498 499 if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
499 500 SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
500 501 baseO = sdc->sdc_base_O;
501 502 baseR = sdc->sdc_base_R;
502 503 }
503 504
504 505 /* compute based on the entire interval */
505 506 *O = (sdc->sdc_base_O - baseO);
506 507 *R = (sdc->sdc_base_R - baseR);
507 508 return;
508 509 }
509 510
510 511 /*
511 512 * If we're called from sysdc_update(), we *must* return a value
512 513 * for newO, so we always call mstate_systhread_times().
513 514 *
514 515 * Otherwise, if we've already done a pri check this tick,
515 516 * we can skip it.
516 517 */
517 518 if (!update && sdc->sdc_pri_check == now) {
518 519 SYSDC_INC_STAT(sysdc_update_times_already_done);
519 520 return;
520 521 }
521 522
522 523 /* Get the current times from the thread */
523 524 sdc->sdc_pri_check = now;
524 525 mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
525 526 *newO = sdc->sdc_cur_O;
526 527
527 528 /*
528 529 * The updating of microstate accounting is not done under a
529 530 * consistent set of locks, particularly the t_waitrq field. This
530 531 * can lead to narrow windows in which we account for time in the
531 532 * wrong bucket, which on the next read will be accounted for
532 533 * correctly.
533 534 *
534 535 * If our sdc_base_* fields were affected by one of these blips, we
535 536 * throw away the old data, and pretend this tick didn't happen.
536 537 */
537 538 if (sdc->sdc_cur_O < sdc->sdc_base_O ||
538 539 sdc->sdc_cur_R < sdc->sdc_base_R) {
539 540
540 541 sdc->sdc_base_O = sdc->sdc_cur_O;
541 542 sdc->sdc_base_R = sdc->sdc_cur_R;
542 543
543 544 SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
544 545 return;
545 546 }
546 547
547 548 *O = sdc->sdc_cur_O - sdc->sdc_base_O;
548 549 *R = sdc->sdc_cur_R - sdc->sdc_base_R;
549 550 }
550 551
551 552 /*
552 553 * sysdc_compute_pri()
553 554 *
554 555 * Recomputes the priority of the thread, leaving the result in
555 556 * sdc->sdc_epri. Returns 1 if a priority update should occur
556 557 * (which will also trigger a cpu_surrender()), otherwise
557 558 * returns 0.
558 559 */
559 560 static uint_t
560 561 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
561 562 {
562 563 kthread_t *const t = sdc->sdc_thread;
563 564 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
564 565 const uint_t tick = (flags & SDC_UPDATE_TICK);
565 566
566 567 hrtime_t O, R;
567 568 hrtime_t newO = -1;
568 569
569 570 ASSERT(THREAD_LOCK_HELD(t));
570 571
571 572 sysdc_update_times(sdc, flags, &O, &R, &newO);
572 573 ASSERT(!update || newO != -1);
573 574
574 575 /* If we have new data, recompute our priority. */
575 576 if ((O + R) != 0) {
576 577 sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
577 578
578 579 /* Adjust our priority to move our DC closer to the target. */
579 580 if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
580 581 sdc->sdc_pri = sdc->sdc_maxpri;
581 582 else
582 583 sdc->sdc_pri = sdc->sdc_minpri;
583 584 }
584 585
585 586 /*
586 587 * If our per-pset duty cycle goes over the max, we will take a break.
587 588 * This forces all sysdc threads in the pset to minimum priority, in
588 589 * order to let everyone else have a chance at the CPU.
589 590 */
590 591 if (sdc->sdc_pset->sdp_need_break) {
591 592 SYSDC_INC_STAT(sysdc_compute_pri_breaking);
592 593 sdc->sdc_epri = sdc->sdc_minpri;
593 594 } else {
594 595 sdc->sdc_epri = sdc->sdc_pri;
595 596 }
596 597
597 598 DTRACE_PROBE4(sysdc__compute__pri,
598 599 kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
599 600 uint_t, sdc->sdc_target_DC);
600 601
601 602 /*
602 603 * For sysdc_update(), we compute the ONPROC time for high-priority
603 604 * threads, which is used to calculate the per-pset duty cycle. We
604 605 * will always tell our callers to update the thread's priority,
605 606 * since we want to force a cpu_surrender().
606 607 *
607 608 * We reset sdc_update_ticks so that sysdc_tick() will only update
608 609 * the thread's priority if our timeout is delayed by a tick or
609 610 * more.
610 611 */
611 612 if (update) {
612 613 /* SDC threads are not allowed to change cpupart bindings. */
613 614 ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
614 615
615 616 /* If we were at MAXPRI, account for our onproc time. */
616 617 if (t->t_pri == sdc->sdc_maxpri &&
617 618 sdc->sdc_last_base_O != 0 &&
618 619 sdc->sdc_last_base_O < newO) {
619 620 sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
620 621 sdc->sdc_pset->sdp_onproc_time +=
621 622 (uint64_t)sdc->sdc_last_O;
622 623 sdc->sdc_pset->sdp_onproc_threads++;
623 624 } else {
624 625 sdc->sdc_last_O = 0;
625 626 }
626 627 sdc->sdc_last_base_O = newO;
627 628
628 629 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
629 630 return (1);
630 631 }
631 632
632 633 /*
633 634 * Like sysdc_update(), sysdc_tick() always wants to update the
634 635 * thread's priority, so that the CPU is surrendered if necessary.
635 636 * We reset sdc_update_ticks so that if the timeout continues to be
636 637 * delayed, we'll update at the regular interval.
637 638 */
638 639 if (tick) {
639 640 ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
640 641 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
641 642 return (1);
642 643 }
643 644
644 645 /*
645 646 * Otherwise, only tell our callers to update the priority if it has
646 647 * changed.
647 648 */
648 649 return (sdc->sdc_epri != t->t_pri);
649 650 }
650 651
651 652 static void
652 653 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
653 654 {
654 655 kthread_t *t = sdc->sdc_thread;
655 656
656 657 ASSERT(THREAD_LOCK_HELD(t));
657 658
658 659 if (sysdc_compute_pri(sdc, flags)) {
659 660 if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
660 661 cpu_surrender(t);
661 662 }
662 663 }
663 664 }
664 665
665 666 /*
666 667 * Add a thread onto the active list. It will only be removed by
667 668 * sysdc_update().
668 669 */
669 670 static void
670 671 sysdc_activate(sysdc_t *sdc)
671 672 {
672 673 sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
673 674 sysdc_t *head;
674 675 kthread_t *t = sdc->sdc_thread;
675 676
676 677 SYSDC_INC_STAT(sysdc_activate_enter);
677 678
678 679 ASSERT(sdc->sdc_next == NULL);
679 680 ASSERT(THREAD_LOCK_HELD(t));
680 681
681 682 do {
682 683 head = *headp;
683 684 sdc->sdc_next = head;
684 685 } while (atomic_cas_ptr(headp, head, sdc) != head);
685 686 }
686 687
687 688 /*
688 689 * sysdc_update() has two jobs:
689 690 *
690 691 * 1. It updates the priorities of all active SDC threads on the system.
691 692 * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
692 693 */
693 694 static void
694 695 sysdc_update(void *arg)
695 696 {
696 697 int idx;
697 698 sysdc_t *freelist = NULL;
698 699 sysdc_pset_t *cur;
699 700 hrtime_t now, diff;
700 701 uint_t redeploy = 1;
701 702
702 703 SYSDC_INC_STAT(sysdc_update_enter);
703 704
704 705 ASSERT(sysdc_update_timeout_started);
705 706
706 707 /*
707 708 * If this is our first time through, diff will be gigantic, and
708 709 * no breaks will be necessary.
709 710 */
710 711 now = gethrtime();
711 712 diff = now - sysdc_last_update;
712 713 sysdc_last_update = now;
713 714
714 715 mutex_enter(&sysdc_pset_lock);
715 716 for (cur = list_head(&sysdc_psets); cur != NULL;
716 717 cur = list_next(&sysdc_psets, cur)) {
717 718 boolean_t breaking = (cur->sdp_should_break != 0);
718 719
719 720 if (cur->sdp_need_break != breaking) {
720 721 DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
721 722 boolean_t, breaking);
722 723 }
723 724 cur->sdp_onproc_time = 0;
724 725 cur->sdp_onproc_threads = 0;
725 726 cur->sdp_need_break = breaking;
726 727 }
727 728 mutex_exit(&sysdc_pset_lock);
728 729
729 730 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
730 731 sysdc_list_t *sdl = &sysdc_active[idx];
731 732 sysdc_t *volatile *headp = &sdl->sdl_list;
732 733 sysdc_t *head, *tail;
733 734 sysdc_t **prevptr;
734 735
735 736 if (*headp == &sysdc_dummy)
736 737 continue;
737 738
738 739 /* Prevent any threads from exiting while we're poking them. */
739 740 mutex_enter(&sdl->sdl_lock);
740 741
741 742 /*
742 743 * Each sdl_list contains a singly-linked list of active
743 744 * threads. Threads which become active while we are
744 745 * processing the list will be added to sdl_list. Since we
745 746 * don't want that to interfere with our own processing, we
746 747 * swap in an empty list. Any newly active threads will
747 748 * go on to this empty list. When finished, we'll put any
748 749 * such threads at the end of the processed list.
749 750 */
750 751 head = atomic_swap_ptr(headp, &sysdc_dummy);
751 752 prevptr = &head;
752 753 while (*prevptr != &sysdc_dummy) {
753 754 sysdc_t *const sdc = *prevptr;
754 755 kthread_t *const t = sdc->sdc_thread;
755 756
756 757 /*
757 758 * If the thread has exited, move its sysdc_t onto
758 759 * freelist, to be freed later.
759 760 */
760 761 if (t == NULL) {
761 762 *prevptr = sdc->sdc_next;
762 763 SYSDC_INC_STAT(sysdc_update_exited);
763 764 sdc->sdc_next = freelist;
764 765 freelist = sdc;
765 766 continue;
766 767 }
767 768
768 769 thread_lock(t);
769 770 if (t->t_cid != sysdccid) {
770 771 thread_unlock(t);
771 772 prevptr = &sdc->sdc_next;
772 773 SYSDC_INC_STAT(sysdc_update_not_sdc);
773 774 continue;
774 775 }
775 776 ASSERT(t->t_cldata == sdc);
776 777
777 778 /*
778 779 * If the thread has been sleeping for longer
779 780 * than sysdc_prune_interval, make it inactive by
780 781 * removing it from the list.
781 782 */
782 783 if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
783 784 sdc->sdc_sleep_updates != 0 &&
784 785 (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
785 786 sysdc_prune_updates) {
786 787 *prevptr = sdc->sdc_next;
787 788 SYSDC_INC_STAT(sysdc_update_idle);
788 789 sdc->sdc_next = NULL;
789 790 thread_unlock(t);
790 791 continue;
791 792 }
792 793 sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
793 794 thread_unlock(t);
794 795
795 796 prevptr = &sdc->sdc_next;
796 797 }
797 798
798 799 /*
799 800 * Add our list to the bucket, putting any new entries
800 801 * added while we were working at the tail of the list.
801 802 */
802 803 do {
803 804 tail = *headp;
804 805 *prevptr = tail;
805 806 } while (atomic_cas_ptr(headp, tail, head) != tail);
806 807
807 808 mutex_exit(&sdl->sdl_lock);
808 809 }
809 810
810 811 mutex_enter(&sysdc_pset_lock);
811 812 for (cur = list_head(&sysdc_psets); cur != NULL;
812 813 cur = list_next(&sysdc_psets, cur)) {
813 814
814 815 cur->sdp_vtime_last_interval =
815 816 diff * cur->sdp_cpupart->cp_ncpus;
816 817 cur->sdp_DC_last_interval =
817 818 (cur->sdp_onproc_time * SYSDC_DC_MAX) /
818 819 cur->sdp_vtime_last_interval;
819 820
820 821 if (cur->sdp_should_break > 0) {
821 822 cur->sdp_should_break--; /* breaking */
822 823 continue;
823 824 }
824 825 if (cur->sdp_dont_break > 0) {
825 826 cur->sdp_dont_break--; /* waiting before checking */
826 827 continue;
827 828 }
828 829 if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
829 830 cur->sdp_should_break = sysdc_break_updates;
830 831 cur->sdp_dont_break = sysdc_nobreak_updates;
831 832 SYSDC_INC_STAT(sysdc_update_take_break);
832 833 }
833 834 }
834 835
835 836 /*
836 837 * If there are no sysdc_psets, there can be no threads, so
837 838 * we can stop doing our timeout. Since we're holding the
838 839 * sysdc_pset_lock, no new sysdc_psets can come in, which will
839 840 * prevent anyone from racing with this and dropping our timeout
840 841 * on the floor.
841 842 */
842 843 if (list_is_empty(&sysdc_psets)) {
843 844 SYSDC_INC_STAT(sysdc_update_no_psets);
844 845 ASSERT(sysdc_update_timeout_started);
845 846 sysdc_update_timeout_started = 0;
846 847
847 848 redeploy = 0;
848 849 }
849 850 mutex_exit(&sysdc_pset_lock);
850 851
851 852 while (freelist != NULL) {
852 853 sysdc_t *cur = freelist;
853 854 freelist = cur->sdc_next;
854 855 kmem_free(cur, sizeof (*cur));
855 856 }
856 857
857 858 if (redeploy) {
858 859 (void) timeout(sysdc_update, arg, sysdc_update_ticks);
859 860 }
860 861 }
861 862
862 863 static void
863 864 sysdc_preempt(kthread_t *t)
864 865 {
865 866 ASSERT(t == curthread);
866 867 ASSERT(THREAD_LOCK_HELD(t));
867 868
868 869 setbackdq(t); /* give others a chance to run */
869 870 }
870 871
871 872 static void
872 873 sysdc_tick(kthread_t *t)
873 874 {
874 875 sysdc_t *sdc;
875 876
876 877 thread_lock(t);
877 878 if (t->t_cid != sysdccid) {
878 879 SYSDC_INC_STAT(sysdc_tick_not_sdc);
879 880 thread_unlock(t);
880 881 return;
881 882 }
882 883 sdc = t->t_cldata;
883 884 if (t->t_state == TS_ONPROC &&
884 885 t->t_pri < t->t_disp_queue->disp_maxrunpri) {
885 886 cpu_surrender(t);
886 887 }
887 888
888 889 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
889 890 ASSERT(sdc->sdc_sleep_updates == 0);
890 891 }
891 892
892 893 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
893 894 sdc->sdc_ticks++;
894 895 if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
895 896 SYSDC_INC_STAT(sysdc_tick_quantum_expired);
896 897 sysdc_update_pri(sdc, SDC_UPDATE_TICK);
897 898 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
898 899 }
899 900 thread_unlock(t);
900 901 }
901 902
902 903 static void
903 904 sysdc_setrun(kthread_t *t)
904 905 {
905 906 sysdc_t *sdc = t->t_cldata;
906 907
907 908 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
908 909
909 910 sdc->sdc_sleep_updates = 0;
910 911
911 912 if (sdc->sdc_next == NULL) {
912 913 /*
913 914 * Since we're in transition, we don't want to use the
914 915 * full thread_update_pri().
915 916 */
916 917 if (sysdc_compute_pri(sdc, 0)) {
917 918 THREAD_CHANGE_PRI(t, sdc->sdc_epri);
918 919 }
919 920 sysdc_activate(sdc);
920 921
921 922 ASSERT(sdc->sdc_next != NULL);
922 923 }
923 924
924 925 setbackdq(t);
925 926 }
926 927
927 928 static void
928 929 sysdc_wakeup(kthread_t *t)
929 930 {
930 931 sysdc_setrun(t);
931 932 }
932 933
933 934 static void
934 935 sysdc_sleep(kthread_t *t)
935 936 {
936 937 sysdc_t *sdc = t->t_cldata;
937 938
938 939 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
939 940
940 941 sdc->sdc_sleep_updates = sdc->sdc_nupdates;
941 942 }
942 943
943 944 /*ARGSUSED*/
944 945 static int
945 946 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
946 947 void *bufp)
947 948 {
948 949 cpupart_t *const cpupart = t->t_cpupart;
949 950 sysdc_t *sdc = bufp;
950 951 sysdc_params_t *sdpp = parmsp;
951 952 sysdc_pset_t *newpset = sdc->sdc_pset;
952 953 sysdc_pset_t *pset;
953 954 int start_timeout;
954 955
955 956 if (t->t_cid != syscid)
956 957 return (EPERM);
957 958
958 959 ASSERT(ttolwp(t) != NULL);
959 960 ASSERT(sdpp != NULL);
960 961 ASSERT(newpset != NULL);
961 962 ASSERT(sysdc_param_init);
962 963
963 964 ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
964 965 ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
965 966 ASSERT(sdpp->sdp_DC >= sysdc_minDC);
966 967 ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
967 968
968 969 sdc->sdc_thread = t;
969 970 sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */
970 971 sdc->sdc_minpri = sdpp->sdp_minpri;
971 972 sdc->sdc_maxpri = sdpp->sdp_maxpri;
972 973 sdc->sdc_target_DC = sdpp->sdp_DC;
973 974 sdc->sdc_ticks = 0;
974 975 sdc->sdc_update_ticks = sysdc_update_ticks + 1;
975 976
976 977 /* Assign ourselves to the appropriate pset. */
977 978 sdc->sdc_pset = NULL;
978 979 mutex_enter(&sysdc_pset_lock);
979 980 for (pset = list_head(&sysdc_psets); pset != NULL;
980 981 pset = list_next(&sysdc_psets, pset)) {
981 982 if (pset->sdp_cpupart == cpupart) {
982 983 break;
983 984 }
984 985 }
985 986 if (pset == NULL) {
986 987 pset = newpset;
987 988 newpset = NULL;
988 989 pset->sdp_cpupart = cpupart;
989 990 list_insert_tail(&sysdc_psets, pset);
990 991 }
991 992 pset->sdp_nthreads++;
992 993 ASSERT(pset->sdp_nthreads > 0);
993 994
994 995 sdc->sdc_pset = pset;
995 996
996 997 start_timeout = (sysdc_update_timeout_started == 0);
997 998 sysdc_update_timeout_started = 1;
998 999 mutex_exit(&sysdc_pset_lock);
999 1000
1000 1001 if (newpset != NULL)
1001 1002 kmem_free(newpset, sizeof (*newpset));
1002 1003
1003 1004 /* Update t's scheduling class and priority. */
1004 1005 thread_lock(t);
1005 1006 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1006 1007 t->t_cid = cid;
1007 1008 t->t_cldata = sdc;
1008 1009 t->t_schedflag |= TS_RUNQMATCH;
1009 1010
1010 1011 sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1011 1012 thread_unlock(t);
1012 1013
1013 1014 /* Kick off the thread timeout if we're the first one in. */
1014 1015 if (start_timeout) {
1015 1016 (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1016 1017 }
1017 1018
1018 1019 return (0);
1019 1020 }
1020 1021
1021 1022 static void
1022 1023 sysdc_leave(sysdc_t *sdc)
1023 1024 {
1024 1025 sysdc_pset_t *sdp = sdc->sdc_pset;
1025 1026 sysdc_list_t *sdl = SYSDC_LIST(sdc);
1026 1027 uint_t freedc;
1027 1028
1028 1029 mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */
1029 1030 sdc->sdc_thread = NULL;
1030 1031 freedc = (sdc->sdc_next == NULL);
1031 1032 mutex_exit(&sdl->sdl_lock);
1032 1033
1033 1034 mutex_enter(&sysdc_pset_lock);
1034 1035 ASSERT(sdp != NULL);
1035 1036 ASSERT(sdp->sdp_nthreads > 0);
1036 1037 --sdp->sdp_nthreads;
1037 1038 if (sdp->sdp_nthreads == 0) {
1038 1039 list_remove(&sysdc_psets, sdp);
1039 1040 } else {
1040 1041 sdp = NULL;
1041 1042 }
1042 1043 mutex_exit(&sysdc_pset_lock);
1043 1044
1044 1045 if (freedc)
1045 1046 kmem_free(sdc, sizeof (*sdc));
1046 1047 if (sdp != NULL)
1047 1048 kmem_free(sdp, sizeof (*sdp));
1048 1049 }
1049 1050
1050 1051 static void
1051 1052 sysdc_exitclass(void *buf)
1052 1053 {
1053 1054 sysdc_leave((sysdc_t *)buf);
1054 1055 }
1055 1056
1056 1057 /*ARGSUSED*/
1057 1058 static int
1058 1059 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1059 1060 {
1060 1061 /* Threads cannot exit SDC once joined, except in a body bag. */
1061 1062 return (EPERM);
1062 1063 }
1063 1064
1064 1065 static void
1065 1066 sysdc_exit(kthread_t *t)
1066 1067 {
1067 1068 sysdc_t *sdc;
1068 1069
1069 1070 /* We're exiting, so we just rejoin the SYS class. */
1070 1071 thread_lock(t);
1071 1072 ASSERT(t->t_cid == sysdccid);
1072 1073 sdc = t->t_cldata;
1073 1074 t->t_cid = syscid;
1074 1075 t->t_cldata = NULL;
1075 1076 t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1076 1077 (void) thread_change_pri(t, maxclsyspri, 0);
1077 1078 t->t_schedflag &= ~TS_RUNQMATCH;
1078 1079 thread_unlock_nopreempt(t);
1079 1080
1080 1081 /* Unlink the sdc from everything. */
1081 1082 sysdc_leave(sdc);
1082 1083 }
1083 1084
1084 1085 /*ARGSUSED*/
1085 1086 static int
1086 1087 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1087 1088 {
1088 1089 /*
1089 1090 * Threads cannot be created with SDC as their class; they must
1090 1091 * be created as SYS and then added with sysdc_thread_enter().
1091 1092 * Because of this restriction, sysdc_fork() should never be called.
1092 1093 */
1093 1094 panic("sysdc cannot be forked");
1094 1095
1095 1096 return (ENOSYS);
1096 1097 }
1097 1098
1098 1099 /*ARGSUSED*/
1099 1100 static void
1100 1101 sysdc_forkret(kthread_t *t, kthread_t *ct)
1101 1102 {
1102 1103 /* SDC threads are part of system processes, which never fork. */
1103 1104 panic("sysdc cannot be forked");
1104 1105 }
1105 1106
1106 1107 static pri_t
1107 1108 sysdc_globpri(kthread_t *t)
1108 1109 {
1109 1110 return (t->t_epri);
1110 1111 }
1111 1112
1112 1113 /*ARGSUSED*/
1113 1114 static pri_t
1114 1115 sysdc_no_swap(kthread_t *t, int flags)
1115 1116 {
1116 1117 /* SDC threads cannot be swapped. */
1117 1118 return (-1);
1118 1119 }
1119 1120
1120 1121 /*
1121 1122 * Get maximum and minimum priorities enjoyed by SDC threads.
1122 1123 */
1123 1124 static int
1124 1125 sysdc_getclpri(pcpri_t *pcprip)
1125 1126 {
1126 1127 pcprip->pc_clpmax = sysdc_maxpri;
1127 1128 pcprip->pc_clpmin = sysdc_minpri;
1128 1129 return (0);
1129 1130 }
1130 1131
1131 1132 /*ARGSUSED*/
1132 1133 static int
1133 1134 sysdc_getclinfo(void *arg)
1134 1135 {
1135 1136 return (0); /* no class-specific info */
1136 1137 }
1137 1138
1138 1139 /*ARGSUSED*/
1139 1140 static int
1140 1141 sysdc_alloc(void **p, int flag)
1141 1142 {
1142 1143 sysdc_t *new;
1143 1144
1144 1145 *p = NULL;
1145 1146 if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1146 1147 return (ENOMEM);
1147 1148 }
1148 1149 if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1149 1150 NULL) {
1150 1151 kmem_free(new, sizeof (*new));
1151 1152 return (ENOMEM);
1152 1153 }
1153 1154 *p = new;
1154 1155 return (0);
1155 1156 }
1156 1157
1157 1158 static void
1158 1159 sysdc_free(void *p)
1159 1160 {
1160 1161 sysdc_t *sdc = p;
1161 1162
1162 1163 if (sdc != NULL) {
1163 1164 /*
1164 1165 * We must have failed CL_ENTERCLASS(), so our pset should be
1165 1166 * there and unused.
1166 1167 */
1167 1168 ASSERT(sdc->sdc_pset != NULL);
1168 1169 ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1169 1170 kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1170 1171 kmem_free(sdc, sizeof (*sdc));
1171 1172 }
1172 1173 }
1173 1174
1174 1175 static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
1175 1176 static int sysdc_einval();
1176 1177 static void sysdc_nullsys();
1177 1178
1178 1179 static struct classfuncs sysdc_classfuncs = {
1179 1180 /* messages to class manager */
1180 1181 {
1181 1182 sysdc_enosys, /* admin */
1182 1183 sysdc_getclinfo,
1183 1184 sysdc_enosys, /* parmsin */
1184 1185 sysdc_enosys, /* parmsout */
1185 1186 sysdc_enosys, /* vaparmsin */
1186 1187 sysdc_enosys, /* vaparmsout */
1187 1188 sysdc_getclpri,
1188 1189 sysdc_alloc,
1189 1190 sysdc_free,
1190 1191 },
1191 1192 /* operations on threads */
1192 1193 {
1193 1194 sysdc_enterclass,
1194 1195 sysdc_exitclass,
1195 1196 sysdc_canexit,
1196 1197 sysdc_fork,
1197 1198 sysdc_forkret,
1198 1199 sysdc_nullsys, /* parmsget */
1199 1200 sysdc_enosys, /* parmsset */
1200 1201 sysdc_nullsys, /* stop */
1201 1202 sysdc_exit,
1202 1203 sysdc_nullsys, /* active */
1203 1204 sysdc_nullsys, /* inactive */
1204 1205 sysdc_no_swap, /* swapin */
1205 1206 sysdc_no_swap, /* swapout */
1206 1207 sysdc_nullsys, /* trapret */
1207 1208 sysdc_preempt,
1208 1209 sysdc_setrun,
1209 1210 sysdc_sleep,
1210 1211 sysdc_tick,
1211 1212 sysdc_wakeup,
1212 1213 sysdc_einval, /* donice */
1213 1214 sysdc_globpri,
1214 1215 sysdc_nullsys, /* set_process_group */
1215 1216 sysdc_nullsys, /* yield */
1216 1217 sysdc_einval, /* doprio */
1217 1218 }
1218 1219 };
1219 1220
1220 1221 static int
1221 1222 sysdc_enosys()
1222 1223 {
1223 1224 return (ENOSYS);
1224 1225 }
1225 1226
1226 1227 static int
1227 1228 sysdc_einval()
1228 1229 {
1229 1230 return (EINVAL);
1230 1231 }
1231 1232
1232 1233 static void
1233 1234 sysdc_nullsys()
1234 1235 {
1235 1236 }
1236 1237
1237 1238 /*ARGSUSED*/
1238 1239 static pri_t
1239 1240 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1240 1241 {
1241 1242 int idx;
1242 1243
1243 1244 list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1244 1245 offsetof(sysdc_pset_t, sdp_node));
1245 1246
1246 1247 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1247 1248 sysdc_active[idx].sdl_list = &sysdc_dummy;
1248 1249 }
1249 1250
1250 1251 sysdc_initparam();
1251 1252
1252 1253 sysdccid = cid;
1253 1254 *clfuncspp = &sysdc_classfuncs;
1254 1255
1255 1256 return ((pri_t)v.v_maxsyspri);
1256 1257 }
1257 1258
1258 1259 static struct sclass csw = {
1259 1260 "SDC",
1260 1261 sysdc_init,
1261 1262 0
1262 1263 };
1263 1264
1264 1265 static struct modlsched modlsched = {
1265 1266 &mod_schedops, "system duty cycle scheduling class", &csw
1266 1267 };
1267 1268
1268 1269 static struct modlinkage modlinkage = {
1269 1270 MODREV_1, (void *)&modlsched, NULL
1270 1271 };
1271 1272
1272 1273 int
1273 1274 _init()
1274 1275 {
1275 1276 return (mod_install(&modlinkage));
1276 1277 }
1277 1278
1278 1279 int
1279 1280 _fini()
1280 1281 {
1281 1282 return (EBUSY); /* can't unload for now */
1282 1283 }
1283 1284
1284 1285 int
1285 1286 _info(struct modinfo *modinfop)
1286 1287 {
1287 1288 return (mod_info(&modlinkage, modinfop));
1288 1289 }
1289 1290
1290 1291 /* --- consolidation-private interfaces --- */
1291 1292 void
1292 1293 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1293 1294 {
1294 1295 void *buf = NULL;
1295 1296 sysdc_params_t sdp;
1296 1297
1297 1298 SYSDC_INC_STAT(sysdc_thread_enter_enter);
1298 1299
1299 1300 ASSERT(sysdc_param_init);
1300 1301 ASSERT(sysdccid >= 0);
1301 1302
1302 1303 ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1303 1304
1304 1305 sdp.sdp_minpri = sysdc_minpri;
1305 1306 sdp.sdp_maxpri = sysdc_maxpri;
1306 1307 sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1307 1308
1308 - VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
1309 + VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1309 1310
1310 1311 ASSERT(t->t_lwp != NULL);
1311 1312 ASSERT(t->t_cid == syscid);
1312 1313 ASSERT(t->t_cldata == NULL);
1313 - VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
1314 - VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
1314 + VERIFY0(CL_CANEXIT(t, NULL));
1315 + VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
1315 1316 CL_EXITCLASS(syscid, NULL);
1316 1317 }
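For context on the change above (issue 3006): the VERIFY3U(expr, ==, 0) calls
are replaced with the terser VERIFY0(expr). As a rough sketch of the intent
(not the exact sys/debug.h definition), the new macro checks that its argument
evaluates to zero, much as if it were written:

	#define	VERIFY0(x)	VERIFY3U((x), ==, 0)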