3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
--- old/usr/src/uts/common/disp/sysdc.c
+++ new/usr/src/uts/common/disp/sysdc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 + * Copyright (c) 2012 by Delphix. All rights reserved.
27 + */
28 +
29 +/*
26 30 * The System Duty Cycle (SDC) scheduling class
27 31 * --------------------------------------------
28 32 *
29 33 * Background
30 34 *
31 35 * Kernel threads in Solaris have traditionally not been large consumers
32 36 * of CPU time. They typically wake up, perform a small amount of
33 37 * work, then go back to sleep waiting for either a timeout or another
34 38 * signal. On the assumption that the small amount of work that they do
35 39 * is important for the behavior of the whole system, these threads are
36 40 * treated kindly by the dispatcher and the SYS scheduling class: they run
37 41 * without preemption from anything other than real-time and interrupt
38 42 * threads; when preempted, they are put at the front of the queue, so they
39 43 * generally do not migrate between CPUs; and they are allowed to stay
40 44 * running until they voluntarily give up the CPU.
41 45 *
42 46 * As Solaris has evolved, new workloads have emerged which require the
43 47 * kernel to perform significant amounts of CPU-intensive work. One
44 48 * example of such a workload is ZFS's transaction group sync processing.
45 49 * Each sync operation generates a large batch of I/Os, and each I/O
46 50 * may need to be compressed and/or checksummed before it is written to
47 51 * storage. The taskq threads which perform the compression and checksums
48 52 * will run nonstop as long as they have work to do; a large sync operation
49 53 * on a compression-heavy dataset can keep them busy for seconds on end.
50 54 * This causes human-time-scale dispatch latency bubbles for any other
51 55 * threads which have the misfortune to share a CPU with the taskq threads.
52 56 *
53 57 * The SDC scheduling class is a solution to this problem.
54 58 *
55 59 *
56 60 * Overview
57 61 *
58 62 * SDC is centered around the concept of a thread's duty cycle (DC):
59 63 *
60 64 * ONPROC time
61 65 * Duty Cycle = ----------------------
62 66 * ONPROC + Runnable time
63 67 *
64 68 * This is the ratio of the time that the thread spent running on a CPU
65 69 * divided by the time it spent running or trying to run. It is unaffected
66 70 * by any time the thread spent sleeping, stopped, etc.
67 71 *
68 72 * A thread joining the SDC class specifies a "target" DC that it wants
69 73 * to run at. To implement this policy, the routine sysdc_update() scans
70 74 * the list of active SDC threads every few ticks and uses each thread's
71 75 * microstate data to compute the actual duty cycle that that thread
72 76 * has experienced recently. If the thread is under its target DC, its
73 77 * priority is increased to the maximum available (sysdc_maxpri, which is
74 78 * 99 by default). If the thread is over its target DC, its priority is
75 79 * reduced to the minimum available (sysdc_minpri, 0 by default). This
76 80 * is a fairly primitive approach, in that it doesn't use any of the
77 81 * intermediate priorities, but it's not completely inappropriate. Even
78 82 * though threads in the SDC class might take a while to do their job, they
79 83 * are by some definition important if they're running inside the kernel,
80 84 * so it is reasonable that they should get to run at priority 99.
81 85 *
82 86 * If a thread is running when sysdc_update() calculates its actual duty
83 87 * cycle, and there are other threads of equal or greater priority on its
84 88 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
85 89 * acknowledges the preemption by calling sysdc_preempt(), which calls
86 90 * setbackdq(), which gives other threads with the same priority a chance
87 91 * to run. This creates a de facto time quantum for threads in the SDC
88 92 * scheduling class.
89 93 *
90 94 * An SDC thread which is assigned priority 0 can continue to run if
91 95 * nothing else needs to use the CPU that it's running on. Similarly, an
92 96 * SDC thread at priority 99 might not get to run as much as it wants to
93 97 * if there are other priority-99 or higher threads on its CPU. These
94 98 * situations would cause the thread to get ahead of or behind its target
95 99 * DC; the longer the situations lasted, the further ahead or behind the
96 100 * thread would get. Rather than condemning a thread to a lifetime of
97 101 * paying for its youthful indiscretions, SDC keeps "base" values for
98 102 * ONPROC and Runnable times in each thread's sysdc data, and updates these
99 103 * values periodically. The duty cycle is then computed using the elapsed
100 104 * amount of ONPROC and Runnable times since those base times.
101 105 *
102 106 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
103 107 * keep the list of "active" threads small by pruning out threads which
104 108 * have been asleep for a brief time. They are not pruned immediately upon
105 109 * going to sleep, since some threads may bounce back and forth between
106 110 * sleeping and being runnable.
107 111 *
108 112 *
109 113 * Interfaces
110 114 *
111 115 * void sysdc_thread_enter(t, dc, flags)
112 116 *
113 117 * Moves a kernel thread from the SYS scheduling class to the
114 118 * SDC class. t must have an associated LWP (created by calling
115 119 * lwp_kernel_create()). The thread will have a target DC of dc.
116 120 * Flags should be either 0 or SYSDC_THREAD_BATCH. If
117 121 * SYSDC_THREAD_BATCH is specified, the thread is expected to be
118 122 * doing large amounts of processing.
119 123 *
120 124 *
121 125 * Complications
122 126 *
123 127 * - Run queue balancing
124 128 *
125 129 * The Solaris dispatcher is biased towards letting a thread run
126 130 * on the same CPU which it last ran on, if no more than 3 ticks
127 131 * (i.e. rechoose_interval) have passed since the thread last ran.
128 132 * This helps to preserve cache warmth. On the other hand, it also
129 133 * tries to keep the per-CPU run queues fairly balanced; if the CPU
130 134 * chosen for a runnable thread has a run queue which is three or
131 135 * more threads longer than a neighboring CPU's queue, the runnable
132 136 * thread is dispatched onto the neighboring CPU instead.
133 137 *
134 138 * These policies work well for some workloads, but not for many SDC
135 139 * threads. The taskq client of SDC, for example, has many discrete
136 140 * units of work to do. The work units are largely independent, so
137 141 * cache warmth is not an important consideration. It is important
138 142 * that the threads fan out quickly to different CPUs, since the
139 143 * amount of work these threads have to do (a few seconds worth at a
140 144 * time) doesn't leave much time to correct thread placement errors
141 145 * (i.e. two SDC threads being dispatched to the same CPU).
142 146 *
143 147 * To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
144 148 * This tells the dispatcher to keep neighboring run queues' lengths
145 149 * more evenly matched, which allows SDC threads to migrate more
146 150 * easily.
147 151 *
148 152 * - LWPs and system processes
149 153 *
150 154 * SDC can only be used for kernel threads. Since SDC uses microstate
151 155 * accounting data to compute each thread's actual duty cycle, all
152 156 * threads entering the SDC class must have associated LWPs (which
153 157 * store the microstate data). This means that the threads have to
154 158 * be associated with an SSYS process, i.e. one created by newproc().
155 159 * If the microstate accounting information is ever moved into the
156 160 * kthread_t, this restriction could be lifted.
157 161 *
158 162 * - Dealing with oversubscription
159 163 *
160 164 * Since SDC duty cycles are per-thread, it is possible that the
161 165 * aggregate requested duty cycle of all SDC threads in a processor
162 166 * set could be greater than the total CPU time available in that set.
163 167 * The FSS scheduling class has an analogous situation, which it deals
164 168 * with by reducing each thread's allotted CPU time proportionally.
165 169 * Since SDC doesn't need to be as precise as FSS, it uses a simpler
166 170 * solution to the oversubscription problem.
167 171 *
168 172 * sysdc_update() accumulates the amount of time that max-priority SDC
169 173 * threads have spent on-CPU in each processor set, and uses that sum
170 174 * to create an implied duty cycle for that processor set:
171 175 *
172 176 * accumulated CPU time
173 177 * pset DC = -----------------------------------
174 178 * (# CPUs) * time since last update
175 179 *
176 180 * If this implied duty cycle is above a maximum pset duty cycle (90%
177 181 * by default), sysdc_update() sets the priority of all SDC threads
178 182 * in that processor set to sysdc_minpri for a "break" period. After
179 183 * the break period, it waits for a "nobreak" period before trying to
180 184 * enforce the pset duty cycle limit again.
181 185 *
182 186 * - Processor sets
183 187 *
184 188 * As the above implies, SDC is processor set aware, but it does not
185 189 * currently allow threads to change processor sets while in the SDC
186 190 * class. Instead, those threads must join the desired processor set
187 191 * before entering SDC. [1]
188 192 *
189 193 * - Batch threads
190 194 *
191 195 * A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
192 196 * flag. This flag currently has no effect, but marks threads which
193 197 * do bulk processing.
194 198 *
195 199 * - t_kpri_req
196 200 *
197 201 * The TS and FSS scheduling classes pay attention to t_kpri_req,
198 202 * which provides a simple form of priority inheritance for
199 203 * synchronization primitives (such as rwlocks held as READER) which
200 204 * cannot be traced to a unique thread. The SDC class does not honor
201 205 * t_kpri_req, for a few reasons:
202 206 *
203 207 * 1. t_kpri_req is notoriously inaccurate. A measure of its
204 208 * inaccuracy is that it needs to be cleared every time a thread
205 209 * returns to user mode, because it is frequently non-zero at that
206 210 * point. This can happen because "ownership" of synchronization
207 211 * primitives that use t_kpri_req can be silently handed off,
208 212 * leaving no opportunity to will the t_kpri_req inheritance.
209 213 *
210 214 * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
211 215 * kernel priority. This means that even if an SDC thread
212 216 * is holding a synchronization primitive and running at low
213 217 * priority, its priority will eventually be raised above 60,
214 218 * allowing it to drive on and release the resource.
215 219 *
216 220 * 3. The first consumer of SDC uses the taskq subsystem, which holds
217 221 * a reader lock for the duration of the task's execution. This
218 222 * would mean that SDC threads would never drop below kernel
219 223 * priority in practice, which defeats one of the purposes of SDC.
220 224 *
221 225 * - Why not FSS?
222 226 *
223 227 * It might seem that the existing FSS scheduling class could solve
224 228 * the problems that SDC is attempting to solve. FSS's more precise
225 229 * solution to the oversubscription problem would hardly cause
226 230 * trouble, as long as it performed well. SDC is implemented as
227 231 * a separate scheduling class for two main reasons: the initial
228 232 * consumer of SDC does not map well onto the "project" abstraction
229 233 * that is central to FSS, and FSS does not expect to run at kernel
230 234 * priorities.
231 235 *
232 236 *
233 237 * Tunables
234 238 *
235 239 * - sysdc_update_interval_msec: Number of milliseconds between
236 240 * consecutive thread priority updates.
237 241 *
238 242 * - sysdc_reset_interval_msec: Number of milliseconds between
239 243 * consecutive resets of a thread's base ONPROC and Runnable
240 244 * times.
241 245 *
242 246 * - sysdc_prune_interval_msec: Number of milliseconds of sleeping
243 247 * before a thread is pruned from the active list.
244 248 *
245 249 * - sysdc_max_pset_DC: Allowable percentage of a processor set's
246 250 * CPU time which SDC can give to its high-priority threads.
247 251 *
248 252 * - sysdc_break_msec: Number of milliseconds of "break" taken when
249 253 * sysdc_max_pset_DC is exceeded.
250 254 *
251 255 *
252 256 * Future work (in SDC and related subsystems)
253 257 *
254 258 * - Per-thread rechoose interval (0 for SDC)
255 259 *
256 260 * Allow each thread to specify its own rechoose interval. SDC
257 261 * threads would specify an interval of zero, which would rechoose
258 262 * the CPU with the lowest priority once per update.
259 263 *
260 264 * - Allow threads to change processor sets after joining the SDC class
261 265 *
262 266 * - Thread groups and per-group DC
263 267 *
264 268 * It might be nice to be able to specify a duty cycle which applies
265 269 * to a group of threads in aggregate.
266 270 *
267 271 * - Per-group DC callback to allow dynamic DC tuning
268 272 *
269 273 * Currently, DCs are assigned when the thread joins SDC. Some
270 274 * workloads could benefit from being able to tune their DC using
271 275 * subsystem-specific knowledge about the workload.
272 276 *
273 277 * - Finer-grained priority updates
274 278 *
275 279 * - More nuanced management of oversubscription
276 280 *
277 281 * - Moving other CPU-intensive threads into SDC
278 282 *
279 283 * - Move msacct data into kthread_t
280 284 *
281 285 * This would allow kernel threads without LWPs to join SDC.
282 286 *
283 287 *
284 288 * Footnotes
285 289 *
286 290 * [1] The details of doing so are left as an exercise for the reader.
287 291 */
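
A minimal sketch of the duty-cycle policy described in the block comment above
(illustrative only; dc_pick_pri is a hypothetical helper, not part of this
change): the priority decision is a two-level bang-bang controller on the
elapsed ONPROC and Runnable times since the last base reset.

	/* Illustrative restatement of the policy; not the kernel implementation. */
	static pri_t
	dc_pick_pri(hrtime_t O, hrtime_t R, uint_t target_DC, pri_t prev,
	    pri_t minpri, pri_t maxpri)
	{
		uint_t cur_DC;

		if (O + R == 0)
			return (prev);	/* no new data; keep current priority */
		cur_DC = (uint_t)((O * SYSDC_DC_MAX) / (O + R));
		return (cur_DC < target_DC ? maxpri : minpri);
	}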
288 292
289 293 #include <sys/types.h>
290 294 #include <sys/sysdc.h>
291 295 #include <sys/sysdc_impl.h>
292 296
293 297 #include <sys/class.h>
294 298 #include <sys/cmn_err.h>
295 299 #include <sys/cpuvar.h>
296 300 #include <sys/cpupart.h>
297 301 #include <sys/debug.h>
298 302 #include <sys/disp.h>
299 303 #include <sys/errno.h>
300 304 #include <sys/inline.h>
301 305 #include <sys/kmem.h>
302 306 #include <sys/modctl.h>
303 307 #include <sys/schedctl.h>
304 308 #include <sys/sdt.h>
305 309 #include <sys/sunddi.h>
306 310 #include <sys/sysmacros.h>
307 311 #include <sys/systm.h>
308 312 #include <sys/var.h>
309 313
310 314 /*
311 315 * Tunables - loaded into the internal state at module load time
312 316 */
313 317 uint_t sysdc_update_interval_msec = 20;
314 318 uint_t sysdc_reset_interval_msec = 400;
315 319 uint_t sysdc_prune_interval_msec = 100;
316 320 uint_t sysdc_max_pset_DC = 90;
317 321 uint_t sysdc_break_msec = 80;
318 322
319 323 /*
320 324 * Internal state - constants set up by sysdc_initparam()
321 325 */
322 326 static clock_t sysdc_update_ticks; /* ticks between updates */
323 327 static uint_t sysdc_prune_updates; /* updates asleep before pruning */
324 328 static uint_t sysdc_reset_updates; /* # of updates before reset */
325 329 static uint_t sysdc_break_updates; /* updates to break */
326 330 static uint_t sysdc_nobreak_updates; /* updates to not check */
327 331 static uint_t sysdc_minDC; /* minimum allowed DC */
328 332 static uint_t sysdc_maxDC; /* maximum allowed DC */
329 333 static pri_t sysdc_minpri; /* minimum allowed priority */
330 334 static pri_t sysdc_maxpri; /* maximum allowed priority */
331 335
332 336 /*
333 337 * Internal state
334 338 */
335 339 static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
336 340 static list_t sysdc_psets; /* list of psets with SDC threads */
337 341 static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
338 342 static uint_t sysdc_update_timeout_started; /* update timeout is active */
339 343 static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
340 344 static sysdc_t sysdc_dummy; /* used to terminate active lists */
341 345
342 346 /*
343 347 * Internal state - active hash table
344 348 */
345 349 #define SYSDC_NLISTS 8
346 350 #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
347 351 static sysdc_list_t sysdc_active[SYSDC_NLISTS];
348 352 #define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
349 353
350 354 #ifdef DEBUG
351 355 static struct {
352 356 uint64_t sysdc_update_times_asleep;
353 357 uint64_t sysdc_update_times_base_ran_backwards;
354 358 uint64_t sysdc_update_times_already_done;
355 359 uint64_t sysdc_update_times_cur_ran_backwards;
356 360 uint64_t sysdc_compute_pri_breaking;
357 361 uint64_t sysdc_activate_enter;
358 362 uint64_t sysdc_update_enter;
359 363 uint64_t sysdc_update_exited;
360 364 uint64_t sysdc_update_not_sdc;
361 365 uint64_t sysdc_update_idle;
362 366 uint64_t sysdc_update_take_break;
363 367 uint64_t sysdc_update_no_psets;
364 368 uint64_t sysdc_tick_not_sdc;
365 369 uint64_t sysdc_tick_quantum_expired;
366 370 uint64_t sysdc_thread_enter_enter;
367 371 } sysdc_stats;
368 372
369 373 #define SYSDC_INC_STAT(x) (sysdc_stats.x++)
370 374 #else
371 375 #define SYSDC_INC_STAT(x) ((void)0)
372 376 #endif
373 377
374 378 /* macros are UPPER CASE */
375 379 #define HOWMANY(a, b) howmany((a), (b))
376 380 #define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
377 381
378 382 static void
379 383 sysdc_initparam(void)
380 384 {
381 385 uint_t sysdc_break_ticks;
382 386
383 387 /* update / prune intervals */
384 388 sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
385 389
386 390 sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
387 391 sysdc_update_interval_msec);
388 392 sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
389 393 sysdc_update_interval_msec);
390 394
391 395 /* We must get at least a little time on CPU. */
392 396 sysdc_minDC = 1;
393 397 sysdc_maxDC = SYSDC_DC_MAX;
394 398 sysdc_minpri = 0;
395 399 sysdc_maxpri = maxclsyspri;
396 400
397 401 /* break parameters */
398 402 if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
399 403 sysdc_max_pset_DC = SYSDC_DC_MAX;
400 404 }
401 405 sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
402 406 sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
403 407
404 408 /*
405 409 * We want:
406 410 *
407 411 * sysdc_max_pset_DC = (nobreak / (break + nobreak))
408 412 *
409 413 * ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
410 414 *
411 415 * sysdc_max_pset_DC * break
412 416 * ==> nobreak = -------------------------
413 417 * 1 - sysdc_max_pset_DC
414 418 */
415 419 sysdc_nobreak_updates =
416 420 HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
417 421 (SYSDC_DC_MAX - sysdc_max_pset_DC));
418 422
419 423 sysdc_param_init = 1;
420 424 }
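
As a worked example of the derivation above, assuming the default tunables and
a 100 Hz clock (usec_per_tick == 10000): sysdc_update_ticks = MSECTOTICKS(20)
= 2, sysdc_break_ticks = MSECTOTICKS(80) = 8, so sysdc_break_updates =
HOWMANY(8, 2) = 4. With sysdc_max_pset_DC = 90 (and SYSDC_DC_MAX taken as
100), sysdc_nobreak_updates = HOWMANY(4 * 90, 100 - 90) = 36, and the
resulting ratio nobreak / (break + nobreak) = 36 / 40 = 90% matches the
target.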
421 425
422 426 #undef HOWMANY
423 427 #undef MSECTOTICKS
424 428
425 429 #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
426 430 #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
427 431 #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
428 432
429 433 /*
430 434 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
431 435 * and Runnable times since the last reset.
432 436 *
433 437 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
434 438 * to track processor set usage.
435 439 */
436 440 static void
437 441 sysdc_update_times(sysdc_t *sdc, uint_t flags,
438 442 hrtime_t *O, hrtime_t *R, hrtime_t *newO)
439 443 {
440 444 kthread_t *const t = sdc->sdc_thread;
441 445 const uint_t initial = (flags & SDC_UPDATE_INITIAL);
442 446 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
443 447 const clock_t now = ddi_get_lbolt();
444 448 uint_t do_reset;
445 449
446 450 ASSERT(THREAD_LOCK_HELD(t));
447 451
448 452 *O = *R = 0;
449 453
450 454 /* If we've been sleeping, we know we haven't had any ONPROC time. */
451 455 if (sdc->sdc_sleep_updates != 0 &&
452 456 sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
453 457 *newO = sdc->sdc_last_base_O;
454 458 SYSDC_INC_STAT(sysdc_update_times_asleep);
455 459 return;
456 460 }
457 461
458 462 /*
459 463 * If this is our first update, or we've hit the reset point,
460 464 * we need to reset our base_{O,R}. Once we've updated them, we
461 465 * report O and R for the entire prior interval.
462 466 */
463 467 do_reset = initial;
464 468 if (update) {
465 469 ++sdc->sdc_nupdates;
466 470 if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
467 471 do_reset = 1;
468 472 }
469 473 if (do_reset) {
470 474 hrtime_t baseO, baseR;
471 475 if (initial) {
472 476 /*
473 477 * Start off our cycle count somewhere in the middle,
474 478 * to keep the resets from all happening at once.
475 479 *
476 480 * 4999 is a handy prime much larger than
477 481 * sysdc_reset_updates, so that we don't run into
478 482 * trouble if the resolution is a multiple of
479 483 * sysdc_reset_updates.
480 484 */
481 485 sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
482 486 sysdc_reset_updates);
483 487 baseO = baseR = 0;
484 488 } else {
485 489 baseO = sdc->sdc_base_O;
486 490 baseR = sdc->sdc_base_R;
487 491 }
488 492
489 493 mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
490 494 *newO = sdc->sdc_base_O;
491 495
492 496 sdc->sdc_reset = now;
493 497 sdc->sdc_pri_check = -1; /* force mismatch below */
494 498
495 499 /*
496 500 * See below for rationale.
497 501 */
498 502 if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
499 503 SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
500 504 baseO = sdc->sdc_base_O;
501 505 baseR = sdc->sdc_base_R;
502 506 }
503 507
504 508 /* compute based on the entire interval */
505 509 *O = (sdc->sdc_base_O - baseO);
506 510 *R = (sdc->sdc_base_R - baseR);
507 511 return;
508 512 }
509 513
510 514 /*
511 515 * If we're called from sysdc_update(), we *must* return a value
512 516 * for newO, so we always call mstate_systhread_times().
513 517 *
514 518 * Otherwise, if we've already done a pri check this tick,
515 519 * we can skip it.
516 520 */
517 521 if (!update && sdc->sdc_pri_check == now) {
518 522 SYSDC_INC_STAT(sysdc_update_times_already_done);
519 523 return;
520 524 }
521 525
522 526 /* Get the current times from the thread */
523 527 sdc->sdc_pri_check = now;
524 528 mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
525 529 *newO = sdc->sdc_cur_O;
526 530
527 531 /*
528 532 * The updating of microstate accounting is not done under a
529 533 * consistent set of locks, particularly the t_waitrq field. This
530 534 * can lead to narrow windows in which we account for time in the
531 535 * wrong bucket, which on the next read will be accounted for
532 536 * correctly.
533 537 *
534 538 * If our sdc_base_* fields were affected by one of these blips, we
535 539 * throw away the old data, and pretend this tick didn't happen.
536 540 */
537 541 if (sdc->sdc_cur_O < sdc->sdc_base_O ||
538 542 sdc->sdc_cur_R < sdc->sdc_base_R) {
539 543
540 544 sdc->sdc_base_O = sdc->sdc_cur_O;
541 545 sdc->sdc_base_R = sdc->sdc_cur_R;
542 546
543 547 SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
544 548 return;
545 549 }
546 550
547 551 *O = sdc->sdc_cur_O - sdc->sdc_base_O;
548 552 *R = sdc->sdc_cur_R - sdc->sdc_base_R;
549 553 }
550 554
551 555 /*
552 556 * sysdc_compute_pri()
553 557 *
554 558 * Recomputes the priority of the thread, leaving the result in
555 559 * sdc->sdc_epri. Returns 1 if a priority update should occur
556 560 * (which will also trigger a cpu_surrender()), otherwise
557 561 * returns 0.
558 562 */
559 563 static uint_t
560 564 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
561 565 {
562 566 kthread_t *const t = sdc->sdc_thread;
563 567 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
564 568 const uint_t tick = (flags & SDC_UPDATE_TICK);
565 569
566 570 hrtime_t O, R;
567 571 hrtime_t newO = -1;
568 572
569 573 ASSERT(THREAD_LOCK_HELD(t));
570 574
571 575 sysdc_update_times(sdc, flags, &O, &R, &newO);
572 576 ASSERT(!update || newO != -1);
573 577
574 578 /* If we have new data, recompute our priority. */
575 579 if ((O + R) != 0) {
576 580 sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
577 581
578 582 /* Adjust our priority to move our DC closer to the target. */
579 583 if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
580 584 sdc->sdc_pri = sdc->sdc_maxpri;
581 585 else
582 586 sdc->sdc_pri = sdc->sdc_minpri;
583 587 }
584 588
585 589 /*
586 590 * If our per-pset duty cycle goes over the max, we will take a break.
587 591 * This forces all sysdc threads in the pset to minimum priority, in
588 592 * order to let everyone else have a chance at the CPU.
589 593 */
590 594 if (sdc->sdc_pset->sdp_need_break) {
591 595 SYSDC_INC_STAT(sysdc_compute_pri_breaking);
592 596 sdc->sdc_epri = sdc->sdc_minpri;
593 597 } else {
594 598 sdc->sdc_epri = sdc->sdc_pri;
595 599 }
596 600
597 601 DTRACE_PROBE4(sysdc__compute__pri,
598 602 kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
599 603 uint_t, sdc->sdc_target_DC);
600 604
601 605 /*
602 606 * For sysdc_update(), we compute the ONPROC time for high-priority
603 607 * threads, which is used to calculate the per-pset duty cycle. We
604 608 * will always tell our callers to update the thread's priority,
605 609 * since we want to force a cpu_surrender().
606 610 *
607 611 * We reset sdc_update_ticks so that sysdc_tick() will only update
608 612 * the thread's priority if our timeout is delayed by a tick or
609 613 * more.
610 614 */
611 615 if (update) {
612 616 /* SDC threads are not allowed to change cpupart bindings. */
613 617 ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
614 618
615 619 /* If we were at MAXPRI, account for our onproc time. */
616 620 if (t->t_pri == sdc->sdc_maxpri &&
617 621 sdc->sdc_last_base_O != 0 &&
618 622 sdc->sdc_last_base_O < newO) {
619 623 sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
620 624 sdc->sdc_pset->sdp_onproc_time +=
621 625 (uint64_t)sdc->sdc_last_O;
622 626 sdc->sdc_pset->sdp_onproc_threads++;
623 627 } else {
624 628 sdc->sdc_last_O = 0;
625 629 }
626 630 sdc->sdc_last_base_O = newO;
627 631
628 632 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
629 633 return (1);
630 634 }
631 635
632 636 /*
633 637 * Like sysdc_update(), sysdc_tick() always wants to update the
634 638 * thread's priority, so that the CPU is surrendered if necessary.
635 639 * We reset sdc_update_ticks so that if the timeout continues to be
636 640 * delayed, we'll update at the regular interval.
637 641 */
638 642 if (tick) {
639 643 ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
640 644 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
641 645 return (1);
642 646 }
643 647
644 648 /*
645 649 * Otherwise, only tell our callers to update the priority if it has
646 650 * changed.
647 651 */
648 652 return (sdc->sdc_epri != t->t_pri);
649 653 }
650 654
651 655 static void
652 656 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
653 657 {
654 658 kthread_t *t = sdc->sdc_thread;
655 659
656 660 ASSERT(THREAD_LOCK_HELD(t));
657 661
658 662 if (sysdc_compute_pri(sdc, flags)) {
659 663 if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
660 664 cpu_surrender(t);
661 665 }
662 666 }
663 667 }
664 668
665 669 /*
666 670 * Add a thread onto the active list. It will only be removed by
667 671 * sysdc_update().
668 672 */
669 673 static void
670 674 sysdc_activate(sysdc_t *sdc)
671 675 {
672 676 sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
673 677 sysdc_t *head;
674 678 kthread_t *t = sdc->sdc_thread;
675 679
676 680 SYSDC_INC_STAT(sysdc_activate_enter);
677 681
678 682 ASSERT(sdc->sdc_next == NULL);
679 683 ASSERT(THREAD_LOCK_HELD(t));
680 684
681 685 do {
682 686 head = *headp;
683 687 sdc->sdc_next = head;
684 688 } while (atomic_cas_ptr(headp, head, sdc) != head);
685 689 }
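
The compare-and-swap push above pairs with the atomic_swap_ptr() drain in
sysdc_update(); a generic sketch of the pattern, with a hypothetical node type
(the real code uses sysdc_t and terminates lists with &sysdc_dummy):

	typedef struct node { struct node *next; } node_t;

	/* producer: push a node onto a LIFO list without taking a lock */
	static void
	lifo_push(node_t *volatile *headp, node_t *n)
	{
		node_t *head;

		do {
			head = *headp;
			n->next = head;
		} while (atomic_cas_ptr(headp, head, n) != head);
	}

	/* consumer: detach the whole list at once, leaving a sentinel behind */
	static node_t *
	lifo_take_all(node_t *volatile *headp, node_t *sentinel)
	{
		return (atomic_swap_ptr(headp, sentinel));
	}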
686 690
687 691 /*
688 692 * sysdc_update() has two jobs:
689 693 *
690 694 * 1. It updates the priorities of all active SDC threads on the system.
691 695 * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
692 696 */
693 697 static void
694 698 sysdc_update(void *arg)
695 699 {
696 700 int idx;
697 701 sysdc_t *freelist = NULL;
698 702 sysdc_pset_t *cur;
699 703 hrtime_t now, diff;
700 704 uint_t redeploy = 1;
701 705
702 706 SYSDC_INC_STAT(sysdc_update_enter);
703 707
704 708 ASSERT(sysdc_update_timeout_started);
705 709
706 710 /*
707 711 * If this is our first time through, diff will be gigantic, and
708 712 * no breaks will be necessary.
709 713 */
710 714 now = gethrtime();
711 715 diff = now - sysdc_last_update;
712 716 sysdc_last_update = now;
713 717
714 718 mutex_enter(&sysdc_pset_lock);
715 719 for (cur = list_head(&sysdc_psets); cur != NULL;
716 720 cur = list_next(&sysdc_psets, cur)) {
717 721 boolean_t breaking = (cur->sdp_should_break != 0);
718 722
719 723 if (cur->sdp_need_break != breaking) {
720 724 DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
721 725 boolean_t, breaking);
722 726 }
723 727 cur->sdp_onproc_time = 0;
724 728 cur->sdp_onproc_threads = 0;
725 729 cur->sdp_need_break = breaking;
726 730 }
727 731 mutex_exit(&sysdc_pset_lock);
728 732
729 733 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
730 734 sysdc_list_t *sdl = &sysdc_active[idx];
731 735 sysdc_t *volatile *headp = &sdl->sdl_list;
732 736 sysdc_t *head, *tail;
733 737 sysdc_t **prevptr;
734 738
735 739 if (*headp == &sysdc_dummy)
736 740 continue;
737 741
738 742 /* Prevent any threads from exiting while we're poking them. */
739 743 mutex_enter(&sdl->sdl_lock);
740 744
741 745 /*
742 746 * Each sdl_list contains a singly-linked list of active
743 747 * threads. Threads which become active while we are
744 748 * processing the list will be added to sdl_list. Since we
745 749 * don't want that to interfere with our own processing, we
746 750 * swap in an empty list. Any newly active threads will
747 751 * go on to this empty list. When finished, we'll put any
748 752 * such threads at the end of the processed list.
749 753 */
750 754 head = atomic_swap_ptr(headp, &sysdc_dummy);
751 755 prevptr = &head;
752 756 while (*prevptr != &sysdc_dummy) {
753 757 sysdc_t *const sdc = *prevptr;
754 758 kthread_t *const t = sdc->sdc_thread;
755 759
756 760 /*
757 761 * If the thread has exited, move its sysdc_t onto
758 762 * freelist, to be freed later.
759 763 */
760 764 if (t == NULL) {
761 765 *prevptr = sdc->sdc_next;
762 766 SYSDC_INC_STAT(sysdc_update_exited);
763 767 sdc->sdc_next = freelist;
764 768 freelist = sdc;
765 769 continue;
766 770 }
767 771
768 772 thread_lock(t);
769 773 if (t->t_cid != sysdccid) {
770 774 thread_unlock(t);
771 775 prevptr = &sdc->sdc_next;
772 776 SYSDC_INC_STAT(sysdc_update_not_sdc);
773 777 continue;
774 778 }
775 779 ASSERT(t->t_cldata == sdc);
776 780
777 781 /*
778 782 * If the thread has been sleeping for longer
779 783 * than sysdc_prune_interval, make it inactive by
780 784 * removing it from the list.
781 785 */
782 786 if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
783 787 sdc->sdc_sleep_updates != 0 &&
784 788 (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
785 789 sysdc_prune_updates) {
786 790 *prevptr = sdc->sdc_next;
787 791 SYSDC_INC_STAT(sysdc_update_idle);
788 792 sdc->sdc_next = NULL;
789 793 thread_unlock(t);
790 794 continue;
791 795 }
792 796 sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
793 797 thread_unlock(t);
794 798
795 799 prevptr = &sdc->sdc_next;
796 800 }
797 801
798 802 /*
799 803 * Add our list to the bucket, putting any new entries
800 804 * added while we were working at the tail of the list.
801 805 */
802 806 do {
803 807 tail = *headp;
804 808 *prevptr = tail;
805 809 } while (atomic_cas_ptr(headp, tail, head) != tail);
806 810
807 811 mutex_exit(&sdl->sdl_lock);
808 812 }
809 813
810 814 mutex_enter(&sysdc_pset_lock);
811 815 for (cur = list_head(&sysdc_psets); cur != NULL;
812 816 cur = list_next(&sysdc_psets, cur)) {
813 817
814 818 cur->sdp_vtime_last_interval =
815 819 diff * cur->sdp_cpupart->cp_ncpus;
816 820 cur->sdp_DC_last_interval =
817 821 (cur->sdp_onproc_time * SYSDC_DC_MAX) /
818 822 cur->sdp_vtime_last_interval;
819 823
820 824 if (cur->sdp_should_break > 0) {
821 825 cur->sdp_should_break--; /* breaking */
822 826 continue;
823 827 }
824 828 if (cur->sdp_dont_break > 0) {
825 829 cur->sdp_dont_break--; /* waiting before checking */
826 830 continue;
827 831 }
828 832 if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
829 833 cur->sdp_should_break = sysdc_break_updates;
830 834 cur->sdp_dont_break = sysdc_nobreak_updates;
831 835 SYSDC_INC_STAT(sysdc_update_take_break);
832 836 }
833 837 }
834 838
835 839 /*
836 840 * If there are no sysdc_psets, there can be no threads, so
837 841 * we can stop doing our timeout. Since we're holding the
838 842 * sysdc_pset_lock, no new sysdc_psets can come in, which will
839 843 * prevent anyone from racing with this and dropping our timeout
840 844 * on the floor.
841 845 */
842 846 if (list_is_empty(&sysdc_psets)) {
843 847 SYSDC_INC_STAT(sysdc_update_no_psets);
844 848 ASSERT(sysdc_update_timeout_started);
845 849 sysdc_update_timeout_started = 0;
846 850
847 851 redeploy = 0;
848 852 }
849 853 mutex_exit(&sysdc_pset_lock);
850 854
851 855 while (freelist != NULL) {
852 856 sysdc_t *cur = freelist;
853 857 freelist = cur->sdc_next;
854 858 kmem_free(cur, sizeof (*cur));
855 859 }
856 860
857 861 if (redeploy) {
858 862 (void) timeout(sysdc_update, arg, sysdc_update_ticks);
859 863 }
860 864 }
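
A worked example of the per-pset check above: with the default 20 ms update
interval on a 4-CPU pset, sdp_vtime_last_interval is roughly 80 ms of virtual
CPU time. If the pset's max-priority SDC threads accumulated 76 ms of ONPROC
time in that window, sdp_DC_last_interval is 76/80 = 95%, which exceeds
sysdc_max_pset_DC (90); the pset then breaks for sysdc_break_updates updates
and, afterwards, skips the check for sysdc_nobreak_updates updates.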
861 865
862 866 static void
863 867 sysdc_preempt(kthread_t *t)
864 868 {
865 869 ASSERT(t == curthread);
866 870 ASSERT(THREAD_LOCK_HELD(t));
867 871
868 872 setbackdq(t); /* give others a chance to run */
869 873 }
870 874
871 875 static void
872 876 sysdc_tick(kthread_t *t)
873 877 {
874 878 sysdc_t *sdc;
875 879
876 880 thread_lock(t);
877 881 if (t->t_cid != sysdccid) {
878 882 SYSDC_INC_STAT(sysdc_tick_not_sdc);
879 883 thread_unlock(t);
880 884 return;
881 885 }
882 886 sdc = t->t_cldata;
883 887 if (t->t_state == TS_ONPROC &&
884 888 t->t_pri < t->t_disp_queue->disp_maxrunpri) {
885 889 cpu_surrender(t);
886 890 }
887 891
888 892 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
889 893 ASSERT(sdc->sdc_sleep_updates == 0);
890 894 }
891 895
892 896 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
893 897 sdc->sdc_ticks++;
894 898 if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
895 899 SYSDC_INC_STAT(sysdc_tick_quantum_expired);
896 900 sysdc_update_pri(sdc, SDC_UPDATE_TICK);
897 901 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
898 902 }
899 903 thread_unlock(t);
900 904 }
901 905
902 906 static void
903 907 sysdc_setrun(kthread_t *t)
904 908 {
905 909 sysdc_t *sdc = t->t_cldata;
906 910
907 911 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
908 912
909 913 sdc->sdc_sleep_updates = 0;
910 914
911 915 if (sdc->sdc_next == NULL) {
912 916 /*
913 917 * Since we're in transition, we don't want to use the
914 918 * full thread_update_pri().
915 919 */
916 920 if (sysdc_compute_pri(sdc, 0)) {
917 921 THREAD_CHANGE_PRI(t, sdc->sdc_epri);
918 922 }
919 923 sysdc_activate(sdc);
920 924
921 925 ASSERT(sdc->sdc_next != NULL);
922 926 }
923 927
924 928 setbackdq(t);
925 929 }
926 930
927 931 static void
928 932 sysdc_wakeup(kthread_t *t)
929 933 {
930 934 sysdc_setrun(t);
931 935 }
932 936
933 937 static void
934 938 sysdc_sleep(kthread_t *t)
935 939 {
936 940 sysdc_t *sdc = t->t_cldata;
937 941
938 942 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
939 943
940 944 sdc->sdc_sleep_updates = sdc->sdc_nupdates;
941 945 }
942 946
943 947 /*ARGSUSED*/
944 948 static int
945 949 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
946 950 void *bufp)
947 951 {
948 952 cpupart_t *const cpupart = t->t_cpupart;
949 953 sysdc_t *sdc = bufp;
950 954 sysdc_params_t *sdpp = parmsp;
951 955 sysdc_pset_t *newpset = sdc->sdc_pset;
952 956 sysdc_pset_t *pset;
953 957 int start_timeout;
954 958
955 959 if (t->t_cid != syscid)
956 960 return (EPERM);
957 961
958 962 ASSERT(ttolwp(t) != NULL);
959 963 ASSERT(sdpp != NULL);
960 964 ASSERT(newpset != NULL);
961 965 ASSERT(sysdc_param_init);
962 966
963 967 ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
964 968 ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
965 969 ASSERT(sdpp->sdp_DC >= sysdc_minDC);
966 970 ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
967 971
968 972 sdc->sdc_thread = t;
969 973 sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */
970 974 sdc->sdc_minpri = sdpp->sdp_minpri;
971 975 sdc->sdc_maxpri = sdpp->sdp_maxpri;
972 976 sdc->sdc_target_DC = sdpp->sdp_DC;
973 977 sdc->sdc_ticks = 0;
974 978 sdc->sdc_update_ticks = sysdc_update_ticks + 1;
975 979
976 980 /* Assign ourselves to the appropriate pset. */
977 981 sdc->sdc_pset = NULL;
978 982 mutex_enter(&sysdc_pset_lock);
979 983 for (pset = list_head(&sysdc_psets); pset != NULL;
980 984 pset = list_next(&sysdc_psets, pset)) {
981 985 if (pset->sdp_cpupart == cpupart) {
982 986 break;
983 987 }
984 988 }
985 989 if (pset == NULL) {
986 990 pset = newpset;
987 991 newpset = NULL;
988 992 pset->sdp_cpupart = cpupart;
989 993 list_insert_tail(&sysdc_psets, pset);
990 994 }
991 995 pset->sdp_nthreads++;
992 996 ASSERT(pset->sdp_nthreads > 0);
993 997
994 998 sdc->sdc_pset = pset;
995 999
996 1000 start_timeout = (sysdc_update_timeout_started == 0);
997 1001 sysdc_update_timeout_started = 1;
998 1002 mutex_exit(&sysdc_pset_lock);
999 1003
1000 1004 if (newpset != NULL)
1001 1005 kmem_free(newpset, sizeof (*newpset));
1002 1006
1003 1007 /* Update t's scheduling class and priority. */
1004 1008 thread_lock(t);
1005 1009 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1006 1010 t->t_cid = cid;
1007 1011 t->t_cldata = sdc;
1008 1012 t->t_schedflag |= TS_RUNQMATCH;
1009 1013
1010 1014 sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1011 1015 thread_unlock(t);
1012 1016
1013 1017 /* Kick off the thread timeout if we're the first one in. */
1014 1018 if (start_timeout) {
1015 1019 (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1016 1020 }
1017 1021
1018 1022 return (0);
1019 1023 }
1020 1024
1021 1025 static void
1022 1026 sysdc_leave(sysdc_t *sdc)
1023 1027 {
1024 1028 sysdc_pset_t *sdp = sdc->sdc_pset;
1025 1029 sysdc_list_t *sdl = SYSDC_LIST(sdc);
1026 1030 uint_t freedc;
1027 1031
1028 1032 mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */
1029 1033 sdc->sdc_thread = NULL;
1030 1034 freedc = (sdc->sdc_next == NULL);
1031 1035 mutex_exit(&sdl->sdl_lock);
1032 1036
1033 1037 mutex_enter(&sysdc_pset_lock);
1034 1038 ASSERT(sdp != NULL);
1035 1039 ASSERT(sdp->sdp_nthreads > 0);
1036 1040 --sdp->sdp_nthreads;
1037 1041 if (sdp->sdp_nthreads == 0) {
1038 1042 list_remove(&sysdc_psets, sdp);
1039 1043 } else {
1040 1044 sdp = NULL;
1041 1045 }
1042 1046 mutex_exit(&sysdc_pset_lock);
1043 1047
1044 1048 if (freedc)
1045 1049 kmem_free(sdc, sizeof (*sdc));
1046 1050 if (sdp != NULL)
1047 1051 kmem_free(sdp, sizeof (*sdp));
1048 1052 }
1049 1053
1050 1054 static void
1051 1055 sysdc_exitclass(void *buf)
1052 1056 {
1053 1057 sysdc_leave((sysdc_t *)buf);
1054 1058 }
1055 1059
1056 1060 /*ARGSUSED*/
1057 1061 static int
1058 1062 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1059 1063 {
1060 1064 /* Threads cannot exit SDC once joined, except in a body bag. */
1061 1065 return (EPERM);
1062 1066 }
1063 1067
1064 1068 static void
1065 1069 sysdc_exit(kthread_t *t)
1066 1070 {
1067 1071 sysdc_t *sdc;
1068 1072
1069 1073 /* We're exiting, so we just rejoin the SYS class. */
1070 1074 thread_lock(t);
1071 1075 ASSERT(t->t_cid == sysdccid);
1072 1076 sdc = t->t_cldata;
1073 1077 t->t_cid = syscid;
1074 1078 t->t_cldata = NULL;
1075 1079 t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1076 1080 (void) thread_change_pri(t, maxclsyspri, 0);
1077 1081 t->t_schedflag &= ~TS_RUNQMATCH;
1078 1082 thread_unlock_nopreempt(t);
1079 1083
1080 1084 /* Unlink the sdc from everything. */
1081 1085 sysdc_leave(sdc);
1082 1086 }
1083 1087
1084 1088 /*ARGSUSED*/
1085 1089 static int
1086 1090 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1087 1091 {
1088 1092 /*
1089 1093 * Threads cannot be created with SDC as their class; they must
1090 1094 * be created as SYS and then added with sysdc_thread_enter().
1091 1095 * Because of this restriction, sysdc_fork() should never be called.
1092 1096 */
1093 1097 panic("sysdc cannot be forked");
1094 1098
1095 1099 return (ENOSYS);
1096 1100 }
1097 1101
1098 1102 /*ARGSUSED*/
1099 1103 static void
1100 1104 sysdc_forkret(kthread_t *t, kthread_t *ct)
1101 1105 {
1102 1106 /* SDC threads are part of system processes, which never fork. */
1103 1107 panic("sysdc cannot be forked");
1104 1108 }
1105 1109
1106 1110 static pri_t
1107 1111 sysdc_globpri(kthread_t *t)
1108 1112 {
1109 1113 return (t->t_epri);
1110 1114 }
1111 1115
1112 1116 /*ARGSUSED*/
1113 1117 static pri_t
1114 1118 sysdc_no_swap(kthread_t *t, int flags)
1115 1119 {
1116 1120 /* SDC threads cannot be swapped. */
1117 1121 return (-1);
1118 1122 }
1119 1123
1120 1124 /*
1121 1125 * Get maximum and minimum priorities enjoyed by SDC threads.
1122 1126 */
1123 1127 static int
1124 1128 sysdc_getclpri(pcpri_t *pcprip)
1125 1129 {
1126 1130 pcprip->pc_clpmax = sysdc_maxpri;
1127 1131 pcprip->pc_clpmin = sysdc_minpri;
1128 1132 return (0);
1129 1133 }
1130 1134
1131 1135 /*ARGSUSED*/
1132 1136 static int
1133 1137 sysdc_getclinfo(void *arg)
1134 1138 {
1135 1139 return (0); /* no class-specific info */
1136 1140 }
1137 1141
1138 1142 /*ARGSUSED*/
1139 1143 static int
1140 1144 sysdc_alloc(void **p, int flag)
1141 1145 {
1142 1146 sysdc_t *new;
1143 1147
1144 1148 *p = NULL;
1145 1149 if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1146 1150 return (ENOMEM);
1147 1151 }
1148 1152 if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1149 1153 NULL) {
1150 1154 kmem_free(new, sizeof (*new));
1151 1155 return (ENOMEM);
1152 1156 }
1153 1157 *p = new;
1154 1158 return (0);
1155 1159 }
1156 1160
1157 1161 static void
1158 1162 sysdc_free(void *p)
1159 1163 {
1160 1164 sysdc_t *sdc = p;
1161 1165
1162 1166 if (sdc != NULL) {
1163 1167 /*
1164 1168 * We must have failed CL_ENTERCLASS(), so our pset should be
1165 1169 * there and unused.
1166 1170 */
1167 1171 ASSERT(sdc->sdc_pset != NULL);
1168 1172 ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1169 1173 kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1170 1174 kmem_free(sdc, sizeof (*sdc));
1171 1175 }
1172 1176 }
1173 1177
1174 1178 static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
1175 1179 static int sysdc_einval();
1176 1180 static void sysdc_nullsys();
1177 1181
1178 1182 static struct classfuncs sysdc_classfuncs = {
1179 1183 /* messages to class manager */
1180 1184 {
1181 1185 sysdc_enosys, /* admin */
1182 1186 sysdc_getclinfo,
1183 1187 sysdc_enosys, /* parmsin */
1184 1188 sysdc_enosys, /* parmsout */
1185 1189 sysdc_enosys, /* vaparmsin */
1186 1190 sysdc_enosys, /* vaparmsout */
1187 1191 sysdc_getclpri,
1188 1192 sysdc_alloc,
1189 1193 sysdc_free,
1190 1194 },
1191 1195 /* operations on threads */
1192 1196 {
1193 1197 sysdc_enterclass,
1194 1198 sysdc_exitclass,
1195 1199 sysdc_canexit,
1196 1200 sysdc_fork,
1197 1201 sysdc_forkret,
1198 1202 sysdc_nullsys, /* parmsget */
1199 1203 sysdc_enosys, /* parmsset */
1200 1204 sysdc_nullsys, /* stop */
1201 1205 sysdc_exit,
1202 1206 sysdc_nullsys, /* active */
1203 1207 sysdc_nullsys, /* inactive */
1204 1208 sysdc_no_swap, /* swapin */
1205 1209 sysdc_no_swap, /* swapout */
1206 1210 sysdc_nullsys, /* trapret */
1207 1211 sysdc_preempt,
1208 1212 sysdc_setrun,
1209 1213 sysdc_sleep,
1210 1214 sysdc_tick,
1211 1215 sysdc_wakeup,
1212 1216 sysdc_einval, /* donice */
1213 1217 sysdc_globpri,
1214 1218 sysdc_nullsys, /* set_process_group */
1215 1219 sysdc_nullsys, /* yield */
1216 1220 sysdc_einval, /* doprio */
1217 1221 }
1218 1222 };
1219 1223
1220 1224 static int
1221 1225 sysdc_enosys()
1222 1226 {
1223 1227 return (ENOSYS);
1224 1228 }
1225 1229
1226 1230 static int
1227 1231 sysdc_einval()
1228 1232 {
1229 1233 return (EINVAL);
1230 1234 }
1231 1235
1232 1236 static void
1233 1237 sysdc_nullsys()
1234 1238 {
1235 1239 }
1236 1240
1237 1241 /*ARGSUSED*/
1238 1242 static pri_t
1239 1243 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1240 1244 {
1241 1245 int idx;
1242 1246
1243 1247 list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1244 1248 offsetof(sysdc_pset_t, sdp_node));
1245 1249
1246 1250 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1247 1251 sysdc_active[idx].sdl_list = &sysdc_dummy;
1248 1252 }
1249 1253
1250 1254 sysdc_initparam();
1251 1255
1252 1256 sysdccid = cid;
1253 1257 *clfuncspp = &sysdc_classfuncs;
1254 1258
1255 1259 return ((pri_t)v.v_maxsyspri);
1256 1260 }
1257 1261
1258 1262 static struct sclass csw = {
1259 1263 "SDC",
1260 1264 sysdc_init,
1261 1265 0
1262 1266 };
1263 1267
1264 1268 static struct modlsched modlsched = {
1265 1269 &mod_schedops, "system duty cycle scheduling class", &csw
1266 1270 };
1267 1271
1268 1272 static struct modlinkage modlinkage = {
1269 1273 MODREV_1, (void *)&modlsched, NULL
1270 1274 };
1271 1275
1272 1276 int
1273 1277 _init()
1274 1278 {
1275 1279 return (mod_install(&modlinkage));
1276 1280 }
1277 1281
1278 1282 int
1279 1283 _fini()
1280 1284 {
1281 1285 return (EBUSY); /* can't unload for now */
1282 1286 }
1283 1287
1284 1288 int
1285 1289 _info(struct modinfo *modinfop)
1286 1290 {
1287 1291 return (mod_info(&modlinkage, modinfop));
1288 1292 }
1289 1293
1290 1294 /* --- consolidation-private interfaces --- */
1291 1295 void
1292 1296 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1293 1297 {
1294 1298 void *buf = NULL;
1295 1299 sysdc_params_t sdp;
1296 1300
1297 1301 SYSDC_INC_STAT(sysdc_thread_enter_enter);
1298 1302
1299 1303 ASSERT(sysdc_param_init);
1300 1304 ASSERT(sysdccid >= 0);
1301 1305
1302 1306 ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1303 1307
1304 1308 sdp.sdp_minpri = sysdc_minpri;
1305 1309 sdp.sdp_maxpri = sysdc_maxpri;
1306 1310 sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1307 1311
1308 - VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
1312 + VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1309 1313
1310 1314 ASSERT(t->t_lwp != NULL);
1311 1315 ASSERT(t->t_cid == syscid);
1312 1316 ASSERT(t->t_cldata == NULL);
1313 - VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
1314 - VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
1317 + VERIFY0(CL_CANEXIT(t, NULL));
1318 + VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
1315 1319 CL_EXITCLASS(syscid, NULL);
1316 1320 }
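
A hedged usage sketch of sysdc_thread_enter() (example_worker is hypothetical;
the real consumer is the taskq subsystem): a kernel thread that belongs to an
SSYS process and has an LWP can move itself into SDC before starting its
CPU-intensive loop.

	/*
	 * Hypothetical worker entry point: enter SDC with an 85% target duty
	 * cycle, then do bulk work. The thread must have been created with an
	 * LWP (e.g. via lwp_kernel_create()), per the Interfaces notes above.
	 */
	static void
	example_worker(void *arg)
	{
		sysdc_thread_enter(curthread, 85, 0);

		for (;;) {
			/* ... perform CPU-intensive work ... */
		}
	}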