/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

/*
 * The System Duty Cycle (SDC) scheduling class
 * --------------------------------------------
 *
 * Background
 *
 * Kernel threads in Solaris have traditionally not been large consumers
 * of CPU time.  They typically wake up, perform a small amount of
 * work, then go back to sleep waiting for either a timeout or another
 * signal.  On the assumption that the small amount of work that they do
 * is important for the behavior of the whole system, these threads are
 * treated kindly by the dispatcher and the SYS scheduling class: they run
 * without preemption from anything other than real-time and interrupt
 * threads; when preempted, they are put at the front of the queue, so they
 * generally do not migrate between CPUs; and they are allowed to stay
 * running until they voluntarily give up the CPU.
 *
 * As Solaris has evolved, new workloads have emerged which require the
 * kernel to perform significant amounts of CPU-intensive work.  One
 * example of such a workload is ZFS's transaction group sync processing.
 * Each sync operation generates a large batch of I/Os, and each I/O
 * may need to be compressed and/or checksummed before it is written to
 * storage.  The taskq threads which perform the compression and checksums
 * will run nonstop as long as they have work to do; a large sync operation
 * on a compression-heavy dataset can keep them busy for seconds on end.
 * This causes human-time-scale dispatch latency bubbles for any other
 * threads which have the misfortune to share a CPU with the taskq threads.
 *
 * The SDC scheduling class is a solution to this problem.
 *
 *
 * Overview
 *
 * SDC is centered around the concept of a thread's duty cycle (DC):
 *
 *                            ONPROC time
 *      Duty Cycle =    ----------------------
 *                      ONPROC + Runnable time
 *
 * This is the ratio of the time that the thread spent running on a CPU
 * divided by the time it spent running or trying to run.  It is unaffected
 * by any time the thread spent sleeping, stopped, etc.
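 *
 * For example (illustrative numbers only): a thread which has
 * accumulated 30ms of ONPROC time and 10ms of Runnable time since its
 * base times has a duty cycle of 30 / (30 + 10) = 75%.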
 *
 * A thread joining the SDC class specifies a "target" DC that it wants
 * to run at.  To implement this policy, the routine sysdc_update() scans
 * the list of active SDC threads every few ticks and uses each thread's
 * microstate data to compute the actual duty cycle that that thread
 * has experienced recently.  If the thread is under its target DC, its
 * priority is increased to the maximum available (sysdc_maxpri, which is
 * 99 by default).  If the thread is over its target DC, its priority is
 * reduced to the minimum available (sysdc_minpri, 0 by default).  This
 * is a fairly primitive approach, in that it doesn't use any of the
 * intermediate priorities, but it's not completely inappropriate.  Even
 * though threads in the SDC class might take a while to do their job, they
 * are by some definition important if they're running inside the kernel,
 * so it is reasonable that they should get to run at priority 99.
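 *
 * A minimal sketch of this bang-bang policy (the real logic lives in
 * sysdc_compute_pri() below; O and R are the elapsed ONPROC and
 * Runnable times since the base times):
 *
 *      cur_DC = (O * SYSDC_DC_MAX) / (O + R);
 *      pri = (cur_DC < target_DC) ? sysdc_maxpri : sysdc_minpri;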
 *
 * If a thread is running when sysdc_update() calculates its actual duty
 * cycle, and there are other threads of equal or greater priority on its
 * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
 * acknowledges the preemption by calling sysdc_preempt(), which calls
 * setbackdq(), which gives other threads with the same priority a chance
 * to run.  This creates a de facto time quantum for threads in the SDC
 * scheduling class.
 *
 * An SDC thread which is assigned priority 0 can continue to run if
 * nothing else needs to use the CPU that it's running on.  Similarly, an
 * SDC thread at priority 99 might not get to run as much as it wants to
 * if there are other priority-99 or higher threads on its CPU.  These
 * situations would cause the thread to get ahead of or behind its target
 * DC; the longer the situations lasted, the further ahead or behind the
 * thread would get.  Rather than condemning a thread to a lifetime of
 * paying for its youthful indiscretions, SDC keeps "base" values for
 * ONPROC and Runnable times in each thread's sysdc data, and updates these
 * values periodically.  The duty cycle is then computed using the elapsed
 * amount of ONPROC and Runnable times since those base times.
 *
 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 * keep the list of "active" threads small by pruning out threads which
 * have been asleep for a brief time.  They are not pruned immediately upon
 * going to sleep, since some threads may bounce back and forth between
 * sleeping and being runnable.
 *
 *
 * Interfaces
 *
 * void sysdc_thread_enter(t, dc, flags)
 *
 *      Moves a kernel thread from the SYS scheduling class to the
 *      SDC class. t must have an associated LWP (created by calling
 *      lwp_kernel_create()).  The thread will have a target DC of dc.
 *      Flags should be either 0 or SYSDC_THREAD_BATCH.  If
 *      SYSDC_THREAD_BATCH is specified, the thread is expected to be
 *      doing large amounts of processing.
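 *
 *      A hypothetical caller sketch, targeting an 80% duty cycle
 *      (worker_func, arg, and the lwp_kernel_create() arguments shown
 *      are illustrative, not a prescription):
 *
 *          t = lwp_kernel_create(p, worker_func, arg, TS_RUN,
 *              minclsyspri);
 *          sysdc_thread_enter(t, 80, 0);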
 *
 *
 * Complications
 *
 * - Run queue balancing
 *
 *      The Solaris dispatcher is biased towards letting a thread run
 *      on the same CPU which it last ran on, if no more than 3 ticks
 *      (i.e. rechoose_interval) have passed since the thread last ran.
 *      This helps to preserve cache warmth.  On the other hand, it also
 *      tries to keep the per-CPU run queues fairly balanced; if the CPU
 *      chosen for a runnable thread has a run queue which is three or
 *      more threads longer than a neighboring CPU's queue, the runnable
 *      thread is dispatched onto the neighboring CPU instead.
 *
 *      These policies work well for some workloads, but not for many SDC
 *      threads.  The taskq client of SDC, for example, has many discrete
 *      units of work to do.  The work units are largely independent, so
 *      cache warmth is not an important consideration.  It is important
 *      that the threads fan out quickly to different CPUs, since the
 *      amount of work these threads have to do (a few seconds worth at a
 *      time) doesn't leave much time to correct thread placement errors
 *      (i.e. two SDC threads being dispatched to the same CPU).
 *
 *      To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 *      This tells the dispatcher to keep neighboring run queues' lengths
 *      more evenly matched, which allows SDC threads to migrate more
 *      easily.
 *
 * - LWPs and system processes
 *
 *      SDC can only be used for kernel threads.  Since SDC uses microstate
 *      accounting data to compute each thread's actual duty cycle, all
 *      threads entering the SDC class must have associated LWPs (which
 *      store the microstate data).  This means that the threads have to
 *      be associated with an SSYS process, i.e. one created by newproc().
 *      If the microstate accounting information is ever moved into the
 *      kthread_t, this restriction could be lifted.
 *
 * - Dealing with oversubscription
 *
 *      Since SDC duty cycles are per-thread, it is possible that the
 *      aggregate requested duty cycle of all SDC threads in a processor
 *      set could be greater than the total CPU time available in that set.
 *      The FSS scheduling class has an analogous situation, which it deals
 *      with by reducing each thread's allotted CPU time proportionally.
 *      Since SDC doesn't need to be as precise as FSS, it uses a simpler
 *      solution to the oversubscription problem.
 *
 *      sysdc_update() accumulates the amount of time that max-priority SDC
 *      threads have spent on-CPU in each processor set, and uses that sum
 *      to create an implied duty cycle for that processor set:
 *
 *                              accumulated CPU time
 *         pset DC =    -----------------------------------
 *                       (# CPUs) * time since last update
 *
 *      If this implied duty cycle is above a maximum pset duty cycle (90%
 *      by default), sysdc_update() sets the priority of all SDC threads
 *      in that processor set to sysdc_minpri for a "break" period.  After
 *      the break period, it waits for a "nobreak" period before trying to
 *      enforce the pset duty cycle limit again.
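 *
 *      For example (illustrative numbers): with 8 CPUs in the set and
 *      a 20ms update interval, 8 * 20ms = 160ms of CPU time is
 *      available per interval.  If max-priority SDC threads accumulated
 *      150ms of ONPROC time in that interval, the implied pset DC is
 *      150 / 160 = ~94%, which exceeds the default 90% cap and
 *      triggers a break.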
 *
 * - Processor sets
 *
 *      As the above implies, SDC is processor set aware, but it does not
 *      currently allow threads to change processor sets while in the SDC
 *      class.  Instead, those threads must join the desired processor set
 *      before entering SDC. [1]
 *
 * - Batch threads
 *
 *      A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *      flag.  This flag currently has no effect, but marks threads which
 *      do bulk processing.
 *
 * - Why not FSS?
 *
 *      It might seem that the existing FSS scheduling class could solve
 *      the problems that SDC is attempting to solve.  FSS's more precise
 *      solution to the oversubscription problem would hardly cause
 *      trouble, as long as it performed well.  SDC is implemented as
 *      a separate scheduling class for two main reasons: the initial
 *      consumer of SDC does not map well onto the "project" abstraction
 *      that is central to FSS, and FSS does not expect to run at kernel
 *      priorities.
 *
 *
 * Tunables
 *
 * - sysdc_update_interval_msec:  Number of milliseconds between
 *      consecutive thread priority updates.
 *
 * - sysdc_reset_interval_msec:  Number of milliseconds between
 *      consecutive resets of a thread's base ONPROC and Runnable
 *      times.
 *
 * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 *      before a thread is pruned from the active list.
 *
 * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
 *      CPU time which SDC can give to its high-priority threads.
 *
 * - sysdc_break_msec:  Number of milliseconds of "break" taken when
 *      sysdc_max_pset_DC is exceeded.
 *
 *
 * Future work (in SDC and related subsystems)
 *
 * - Per-thread rechoose interval (0 for SDC)
 *
 *      Allow each thread to specify its own rechoose interval.  SDC
 *      threads would specify an interval of zero, which would rechoose
 *      the CPU with the lowest priority once per update.
 *
 * - Allow threads to change processor sets after joining the SDC class
 *
 * - Thread groups and per-group DC
 *
 *      It might be nice to be able to specify a duty cycle which applies
 *      to a group of threads in aggregate.
 *
 * - Per-group DC callback to allow dynamic DC tuning
 *
 *      Currently, DCs are assigned when the thread joins SDC.  Some
 *      workloads could benefit from being able to tune their DC using
 *      subsystem-specific knowledge about the workload.
 *
 * - Finer-grained priority updates
 *
 * - More nuanced management of oversubscription
 *
 * - Moving other CPU-intensive threads into SDC
 *
 * - Move msacct data into kthread_t
 *
 *      This would allow kernel threads without LWPs to join SDC.
 *
 *
 * Footnotes
 *
 * [1] The details of doing so are left as an exercise for the reader.
 */

#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>

#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/schedctl.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/var.h>

/*
 * Tunables - loaded into the internal state at module load time
 */
uint_t          sysdc_update_interval_msec = 20;
uint_t          sysdc_reset_interval_msec = 400;
uint_t          sysdc_prune_interval_msec = 100;
uint_t          sysdc_max_pset_DC = 90;
uint_t          sysdc_break_msec = 80;

/*
 * Internal state - constants set up by sysdc_initparam()
 */
static clock_t  sysdc_update_ticks;     /* ticks between updates */
static uint_t   sysdc_prune_updates;    /* updates asleep before pruning */
static uint_t   sysdc_reset_updates;    /* # of updates before reset */
static uint_t   sysdc_break_updates;    /* updates to break */
static uint_t   sysdc_nobreak_updates;  /* updates to not check */
static uint_t   sysdc_minDC;            /* minimum allowed DC */
static uint_t   sysdc_maxDC;            /* maximum allowed DC */
static pri_t    sysdc_minpri;           /* minimum allowed priority */
static pri_t    sysdc_maxpri;           /* maximum allowed priority */

/*
 * Internal state
 */
static kmutex_t sysdc_pset_lock;        /* lock protecting pset data */
static list_t   sysdc_psets;            /* list of psets with SDC threads */
static uint_t   sysdc_param_init;       /* sysdc_initparam() has been called */
static uint_t   sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t sysdc_last_update;      /* time of last sysdc_update() */
static sysdc_t  sysdc_dummy;            /* used to terminate active lists */

/*
 * Internal state - active hash table
 */
#define SYSDC_NLISTS    8
#define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t     sysdc_active[SYSDC_NLISTS];
#define SYSDC_LIST(sdc)         (&sysdc_active[SYSDC_HASH(sdc)])

#ifdef DEBUG
static struct {
        uint64_t        sysdc_update_times_asleep;
        uint64_t        sysdc_update_times_base_ran_backwards;
        uint64_t        sysdc_update_times_already_done;
        uint64_t        sysdc_update_times_cur_ran_backwards;
        uint64_t        sysdc_compute_pri_breaking;
        uint64_t        sysdc_activate_enter;
        uint64_t        sysdc_update_enter;
        uint64_t        sysdc_update_exited;
        uint64_t        sysdc_update_not_sdc;
        uint64_t        sysdc_update_idle;
        uint64_t        sysdc_update_take_break;
        uint64_t        sysdc_update_no_psets;
        uint64_t        sysdc_tick_not_sdc;
        uint64_t        sysdc_tick_quantum_expired;
        uint64_t        sysdc_thread_enter_enter;
} sysdc_stats;

#define SYSDC_INC_STAT(x)       (sysdc_stats.x++)
#else
#define SYSDC_INC_STAT(x)       ((void)0)
#endif

/* macros are UPPER CASE */
#define HOWMANY(a, b)   howmany((a), (b))
#define MSECTOTICKS(a)  HOWMANY((a) * 1000, usec_per_tick)

static void
sysdc_initparam(void)
{
        uint_t sysdc_break_ticks;

        /* update / prune intervals */
        sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);

        sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
            sysdc_update_interval_msec);
        sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
            sysdc_update_interval_msec);

        /* We must get at least a little time on CPU. */
        sysdc_minDC = 1;
        sysdc_maxDC = SYSDC_DC_MAX;
        sysdc_minpri = 0;
        sysdc_maxpri = maxclsyspri - 1;

        /* break parameters */
        if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
                sysdc_max_pset_DC = SYSDC_DC_MAX;
        }
        sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
        sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);

        /*
         * We want:
         *
         *      sysdc_max_pset_DC = (nobreak / (break + nobreak))
         *
         *      ==>    nobreak = sysdc_max_pset_DC * (break + nobreak)
         *
         *                       sysdc_max_pset_DC * break
         *      ==>    nobreak = -------------------------
         *                        1 - sysdc_max_pset_DC
         */
        sysdc_nobreak_updates =
            HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
            (SYSDC_DC_MAX - sysdc_max_pset_DC));

        sysdc_param_init = 1;
}
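
/*
 * As a worked example of the above (assuming the default tunables, a
 * 100Hz clock so that usec_per_tick == 10000, and SYSDC_DC_MAX == 100):
 *
 *      sysdc_update_ticks    = MSECTOTICKS(20)                = 2 ticks
 *      sysdc_prune_updates   = HOWMANY(100, 20)               = 5 updates
 *      sysdc_reset_updates   = HOWMANY(400, 20)               = 20 updates
 *      sysdc_break_updates   = HOWMANY(MSECTOTICKS(80), 2)    = 4 updates
 *      sysdc_nobreak_updates = HOWMANY(4 * 90, 100 - 90)      = 36 updates
 */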

#undef HOWMANY
#undef MSECTOTICKS

#define SDC_UPDATE_INITIAL      0x1     /* for the initial update */
#define SDC_UPDATE_TIMEOUT      0x2     /* from sysdc_update() */
#define SDC_UPDATE_TICK         0x4     /* from sysdc_tick(), on expiry */

/*
 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 * and Runnable times since the last reset.
 *
 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 * to track processor set usage.
 */
static void
sysdc_update_times(sysdc_t *sdc, uint_t flags,
    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t    initial = (flags & SDC_UPDATE_INITIAL);
        const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
        const clock_t   now = ddi_get_lbolt();
        uint_t          do_reset;

        ASSERT(THREAD_LOCK_HELD(t));

        *O = *R = 0;

        /* If we've been sleeping, we know we haven't had any ONPROC time. */
        if (sdc->sdc_sleep_updates != 0 &&
            sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
                *newO = sdc->sdc_last_base_O;
                SYSDC_INC_STAT(sysdc_update_times_asleep);
                return;
        }

        /*
         * If this is our first update, or we've hit the reset point,
         * we need to reset our base_{O,R}.  Once we've updated them, we
         * report O and R for the entire prior interval.
         */
        do_reset = initial;
        if (update) {
                ++sdc->sdc_nupdates;
                if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
                        do_reset = 1;
        }
        if (do_reset) {
                hrtime_t baseO, baseR;
                if (initial) {
                        /*
                         * Start off our cycle count somewhere in the middle,
                         * to keep the resets from all happening at once.
                         *
                         * 4999 is a handy prime much larger than
                         * sysdc_reset_updates, so that we don't run into
                         * trouble if the resolution is a multiple of
                         * sysdc_reset_updates.
                         */
                        sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
                            sysdc_reset_updates);
                        baseO = baseR = 0;
                } else {
                        baseO = sdc->sdc_base_O;
                        baseR = sdc->sdc_base_R;
                }

                mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
                *newO = sdc->sdc_base_O;

                sdc->sdc_reset = now;
                sdc->sdc_pri_check = -1; /* force mismatch below */

                /*
                 * See below for rationale.
                 */
                if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
                        SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
                        baseO = sdc->sdc_base_O;
                        baseR = sdc->sdc_base_R;
                }

                /* compute based on the entire interval */
                *O = (sdc->sdc_base_O - baseO);
                *R = (sdc->sdc_base_R - baseR);
                return;
        }

        /*
         * If we're called from sysdc_update(), we *must* return a value
         * for newO, so we always call mstate_systhread_times().
         *
         * Otherwise, if we've already done a pri check this tick,
         * we can skip it.
         */
        if (!update && sdc->sdc_pri_check == now) {
                SYSDC_INC_STAT(sysdc_update_times_already_done);
                return;
        }

        /* Get the current times from the thread */
        sdc->sdc_pri_check = now;
        mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
        *newO = sdc->sdc_cur_O;

        /*
         * The updating of microstate accounting is not done under a
         * consistent set of locks, particularly the t_waitrq field.  This
         * can lead to narrow windows in which we account for time in the
         * wrong bucket, which on the next read will be accounted for
         * correctly.
         *
         * If our sdc_base_* fields were affected by one of these blips, we
         * throw away the old data, and pretend this tick didn't happen.
         */
        if (sdc->sdc_cur_O < sdc->sdc_base_O ||
            sdc->sdc_cur_R < sdc->sdc_base_R) {

                sdc->sdc_base_O = sdc->sdc_cur_O;
                sdc->sdc_base_R = sdc->sdc_cur_R;

                SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
                return;
        }

        *O = sdc->sdc_cur_O - sdc->sdc_base_O;
        *R = sdc->sdc_cur_R - sdc->sdc_base_R;
}

/*
 * sysdc_compute_pri()
 *
 *      Recomputes the priority of the thread, leaving the result in
 *      sdc->sdc_epri.  Returns 1 if a priority update should occur
 *      (which will also trigger a cpu_surrender()), otherwise
 *      returns 0.
 */
static uint_t
sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
        const uint_t    tick = (flags & SDC_UPDATE_TICK);

        hrtime_t        O, R;
        hrtime_t        newO = -1;

        ASSERT(THREAD_LOCK_HELD(t));

        sysdc_update_times(sdc, flags, &O, &R, &newO);
        ASSERT(!update || newO != -1);

        /* If we have new data, recompute our priority. */
        if ((O + R) != 0) {
                sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);

                /* Adjust our priority to move our DC closer to the target. */
                if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
                        sdc->sdc_pri = sdc->sdc_maxpri;
                else
                        sdc->sdc_pri = sdc->sdc_minpri;
        }

        /*
         * If our per-pset duty cycle goes over the max, we will take a break.
         * This forces all sysdc threads in the pset to minimum priority, in
         * order to let everyone else have a chance at the CPU.
         */
        if (sdc->sdc_pset->sdp_need_break) {
                SYSDC_INC_STAT(sysdc_compute_pri_breaking);
                sdc->sdc_epri = sdc->sdc_minpri;
        } else {
                sdc->sdc_epri = sdc->sdc_pri;
        }

        DTRACE_PROBE4(sysdc__compute__pri,
            kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
            uint_t, sdc->sdc_target_DC);

        /*
         * For sysdc_update(), we compute the ONPROC time for high-priority
         * threads, which is used to calculate the per-pset duty cycle.  We
         * will always tell our callers to update the thread's priority,
         * since we want to force a cpu_surrender().
         *
         * We reset sdc_update_ticks so that sysdc_tick() will only update
         * the thread's priority if our timeout is delayed by a tick or
         * more.
         */
        if (update) {
                /* SDC threads are not allowed to change cpupart bindings. */
                ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);

                /* If we were at MAXPRI, account for our onproc time. */
                if (t->t_pri == sdc->sdc_maxpri &&
                    sdc->sdc_last_base_O != 0 &&
                    sdc->sdc_last_base_O < newO) {
                        sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
                        sdc->sdc_pset->sdp_onproc_time +=
                            (uint64_t)sdc->sdc_last_O;
                        sdc->sdc_pset->sdp_onproc_threads++;
                } else {
                        sdc->sdc_last_O = 0;
                }
                sdc->sdc_last_base_O = newO;

                sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
                return (1);
        }

        /*
         * Like sysdc_update(), sysdc_tick() always wants to update the
         * thread's priority, so that the CPU is surrendered if necessary.
         * We reset sdc_update_ticks so that if the timeout continues to be
         * delayed, we'll update at the regular interval.
         */
        if (tick) {
                ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
                sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
                return (1);
        }

        /*
         * Otherwise, only tell our callers to update the priority if it has
         * changed.
         */
        return (sdc->sdc_epri != t->t_pri);
}

static void
sysdc_update_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *t = sdc->sdc_thread;

        ASSERT(THREAD_LOCK_HELD(t));

        if (sysdc_compute_pri(sdc, flags)) {
                if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
                        cpu_surrender(t);
                }
        }
}

/*
 * Add a thread onto the active list.  It will only be removed by
 * sysdc_update().
 */
static void
sysdc_activate(sysdc_t *sdc)
{
        sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
        sysdc_t         *head;
        kthread_t       *t = sdc->sdc_thread;

        SYSDC_INC_STAT(sysdc_activate_enter);

        ASSERT(sdc->sdc_next == NULL);
        ASSERT(THREAD_LOCK_HELD(t));

        do {
                head = *headp;
                sdc->sdc_next = head;
        } while (atomic_cas_ptr(headp, head, sdc) != head);
}

/*
 * sysdc_update() has two jobs:
 *
 *      1. It updates the priorities of all active SDC threads on the system.
 *      2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
 */
static void
sysdc_update(void *arg)
{
        int             idx;
        sysdc_t         *freelist = NULL;
        sysdc_pset_t    *cur;
        hrtime_t        now, diff;
        uint_t          redeploy = 1;

        SYSDC_INC_STAT(sysdc_update_enter);

        ASSERT(sysdc_update_timeout_started);

        /*
         * If this is our first time through, diff will be gigantic, and
         * no breaks will be necessary.
         */
        now = gethrtime();
        diff = now - sysdc_last_update;
        sysdc_last_update = now;

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {
                boolean_t breaking = (cur->sdp_should_break != 0);

                if (cur->sdp_need_break != breaking) {
                        DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
                            boolean_t, breaking);
                }
                cur->sdp_onproc_time = 0;
                cur->sdp_onproc_threads = 0;
                cur->sdp_need_break = breaking;
        }
        mutex_exit(&sysdc_pset_lock);

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_list_t            *sdl = &sysdc_active[idx];
                sysdc_t *volatile       *headp = &sdl->sdl_list;
                sysdc_t                 *head, *tail;
                sysdc_t                 **prevptr;

                if (*headp == &sysdc_dummy)
                        continue;

                /* Prevent any threads from exiting while we're poking them. */
                mutex_enter(&sdl->sdl_lock);

                /*
                 * Each sdl_list contains a singly-linked list of active
                 * threads. Threads which become active while we are
                 * processing the list will be added to sdl_list.  Since we
                 * don't want that to interfere with our own processing, we
                 * swap in an empty list.  Any newly active threads will
                 * go on to this empty list.  When finished, we'll put any
                 * such threads at the end of the processed list.
                 */
                head = atomic_swap_ptr(headp, &sysdc_dummy);
                prevptr = &head;
                while (*prevptr != &sysdc_dummy) {
                        sysdc_t         *const  sdc = *prevptr;
                        kthread_t       *const  t = sdc->sdc_thread;

                        /*
                         * If the thread has exited, move its sysdc_t onto
                         * freelist, to be freed later.
                         */
                        if (t == NULL) {
                                *prevptr = sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_exited);
                                sdc->sdc_next = freelist;
                                freelist = sdc;
                                continue;
                        }

                        thread_lock(t);
                        if (t->t_cid != sysdccid) {
                                thread_unlock(t);
                                prevptr = &sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_not_sdc);
                                continue;
                        }
                        ASSERT(t->t_cldata == sdc);

                        /*
                         * If the thread has been sleeping for longer
                         * than sysdc_prune_interval, make it inactive by
                         * removing it from the list.
                         */
                        if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
                            sdc->sdc_sleep_updates != 0 &&
                            (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
                            sysdc_prune_updates) {
                                *prevptr = sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_idle);
                                sdc->sdc_next = NULL;
                                thread_unlock(t);
                                continue;
                        }
                        sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
                        thread_unlock(t);

                        prevptr = &sdc->sdc_next;
                }

                /*
                 * Add our list to the bucket, putting any new entries
                 * added while we were working at the tail of the list.
                 */
                do {
                        tail = *headp;
                        *prevptr = tail;
                } while (atomic_cas_ptr(headp, tail, head) != tail);

                mutex_exit(&sdl->sdl_lock);
        }

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {

                cur->sdp_vtime_last_interval =
                    diff * cur->sdp_cpupart->cp_ncpus;
                cur->sdp_DC_last_interval =
                    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
                    cur->sdp_vtime_last_interval;

                if (cur->sdp_should_break > 0) {
                        cur->sdp_should_break--;        /* breaking */
                        continue;
                }
                if (cur->sdp_dont_break > 0) {
                        cur->sdp_dont_break--;  /* waiting before checking */
                        continue;
                }
                if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
                        cur->sdp_should_break = sysdc_break_updates;
                        cur->sdp_dont_break = sysdc_nobreak_updates;
                        SYSDC_INC_STAT(sysdc_update_take_break);
                }
        }

        /*
         * If there are no sysdc_psets, there can be no threads, so
         * we can stop doing our timeout.  Since we're holding the
         * sysdc_pset_lock, no new sysdc_psets can come in, which will
         * prevent anyone from racing with this and dropping our timeout
         * on the floor.
         */
        if (list_is_empty(&sysdc_psets)) {
                SYSDC_INC_STAT(sysdc_update_no_psets);
                ASSERT(sysdc_update_timeout_started);
                sysdc_update_timeout_started = 0;

                redeploy = 0;
        }
        mutex_exit(&sysdc_pset_lock);

        while (freelist != NULL) {
                sysdc_t *cur = freelist;
                freelist = cur->sdc_next;
                kmem_free(cur, sizeof (*cur));
        }

        if (redeploy) {
                (void) timeout(sysdc_update, arg, sysdc_update_ticks);
        }
}

static void
sysdc_preempt(kthread_t *t)
{
        ASSERT(t == curthread);
        ASSERT(THREAD_LOCK_HELD(t));

        setbackdq(t);           /* give others a chance to run */
}

static void
sysdc_tick(kthread_t *t)
{
        sysdc_t *sdc;

        thread_lock(t);
        if (t->t_cid != sysdccid) {
                SYSDC_INC_STAT(sysdc_tick_not_sdc);
                thread_unlock(t);
                return;
        }
        sdc = t->t_cldata;
        if (t->t_state == TS_ONPROC &&
            t->t_pri < t->t_disp_queue->disp_maxrunpri) {
                cpu_surrender(t);
        }

        if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
                ASSERT(sdc->sdc_sleep_updates == 0);
        }

        ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        sdc->sdc_ticks++;
        if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
                SYSDC_INC_STAT(sysdc_tick_quantum_expired);
                sysdc_update_pri(sdc, SDC_UPDATE_TICK);
                ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        }
        thread_unlock(t);
}

static void
sysdc_setrun(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = 0;

        if (sdc->sdc_next == NULL) {
                /*
                 * Since we're in transition, we don't want to use the
                 * full thread_update_pri().
                 */
                if (sysdc_compute_pri(sdc, 0)) {
                        THREAD_CHANGE_PRI(t, sdc->sdc_epri);
                }
                sysdc_activate(sdc);

                ASSERT(sdc->sdc_next != NULL);
        }

        setbackdq(t);
}

static void
sysdc_wakeup(kthread_t *t)
{
        sysdc_setrun(t);
}

static void
sysdc_sleep(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = sdc->sdc_nupdates;
}

/*ARGSUSED*/
static int
sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
    void *bufp)
{
        cpupart_t *const cpupart = t->t_cpupart;
        sysdc_t *sdc = bufp;
        sysdc_params_t *sdpp = parmsp;
        sysdc_pset_t *newpset = sdc->sdc_pset;
        sysdc_pset_t *pset;
        int start_timeout;

        if (t->t_cid != syscid)
                return (EPERM);

        ASSERT(ttolwp(t) != NULL);
        ASSERT(sdpp != NULL);
        ASSERT(newpset != NULL);
        ASSERT(sysdc_param_init);

        ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
        ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
        ASSERT(sdpp->sdp_DC >= sysdc_minDC);
        ASSERT(sdpp->sdp_DC <= sysdc_maxDC);

        sdc->sdc_thread = t;
        sdc->sdc_pri = sdpp->sdp_maxpri;        /* start off maximally */
        sdc->sdc_minpri = sdpp->sdp_minpri;
        sdc->sdc_maxpri = sdpp->sdp_maxpri;
        sdc->sdc_target_DC = sdpp->sdp_DC;
        sdc->sdc_ticks = 0;
        sdc->sdc_update_ticks = sysdc_update_ticks + 1;

        /* Assign ourselves to the appropriate pset. */
        sdc->sdc_pset = NULL;
        mutex_enter(&sysdc_pset_lock);
        for (pset = list_head(&sysdc_psets); pset != NULL;
            pset = list_next(&sysdc_psets, pset)) {
                if (pset->sdp_cpupart == cpupart) {
                        break;
                }
        }
        if (pset == NULL) {
                pset = newpset;
                newpset = NULL;
                pset->sdp_cpupart = cpupart;
                list_insert_tail(&sysdc_psets, pset);
        }
        pset->sdp_nthreads++;
        ASSERT(pset->sdp_nthreads > 0);

        sdc->sdc_pset = pset;

        start_timeout = (sysdc_update_timeout_started == 0);
        sysdc_update_timeout_started = 1;
        mutex_exit(&sysdc_pset_lock);

        if (newpset != NULL)
                kmem_free(newpset, sizeof (*newpset));

        /* Update t's scheduling class and priority. */
        thread_lock(t);
        t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
        t->t_cid = cid;
        t->t_cldata = sdc;
        t->t_schedflag |= TS_RUNQMATCH;

        sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
        thread_unlock(t);

        /* Kick off the thread timeout if we're the first one in. */
        if (start_timeout) {
                (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
        }

        return (0);
}

static void
sysdc_leave(sysdc_t *sdc)
{
        sysdc_pset_t *sdp = sdc->sdc_pset;
        sysdc_list_t *sdl = SYSDC_LIST(sdc);
        uint_t freedc;

        mutex_enter(&sdl->sdl_lock);    /* block sysdc_update() */
        sdc->sdc_thread = NULL;
        freedc = (sdc->sdc_next == NULL);
        mutex_exit(&sdl->sdl_lock);

        mutex_enter(&sysdc_pset_lock);
        ASSERT(sdp != NULL);
        ASSERT(sdp->sdp_nthreads > 0);
        --sdp->sdp_nthreads;
        if (sdp->sdp_nthreads == 0) {
                list_remove(&sysdc_psets, sdp);
        } else {
                sdp = NULL;
        }
        mutex_exit(&sysdc_pset_lock);

        if (freedc)
                kmem_free(sdc, sizeof (*sdc));
        if (sdp != NULL)
                kmem_free(sdp, sizeof (*sdp));
}

static void
sysdc_exitclass(void *buf)
{
        sysdc_leave((sysdc_t *)buf);
}

/*ARGSUSED*/
static int
sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
        /* Threads cannot exit SDC once joined, except in a body bag. */
        return (EPERM);
}

static void
sysdc_exit(kthread_t *t)
{
        sysdc_t *sdc;

        /* We're exiting, so we just rejoin the SYS class. */
        thread_lock(t);
        ASSERT(t->t_cid == sysdccid);
        sdc = t->t_cldata;
        t->t_cid = syscid;
        t->t_cldata = NULL;
        t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
        (void) thread_change_pri(t, maxclsyspri, 0);
        t->t_schedflag &= ~TS_RUNQMATCH;
        thread_unlock_nopreempt(t);

        /* Unlink the sdc from everything. */
        sysdc_leave(sdc);
}

/*ARGSUSED*/
static int
sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
        /*
         * Threads cannot be created with SDC as their class; they must
         * be created as SYS and then added with sysdc_thread_enter().
         * Because of this restriction, sysdc_fork() should never be called.
         */
        panic("sysdc cannot be forked");

        return (ENOSYS);
}

/*ARGSUSED*/
static void
sysdc_forkret(kthread_t *t, kthread_t *ct)
{
        /* SDC threads are part of system processes, which never fork. */
        panic("sysdc cannot be forked");
}

static pri_t
sysdc_globpri(kthread_t *t)
{
        return (t->t_epri);
}

/*ARGSUSED*/
static pri_t
sysdc_no_swap(kthread_t *t, int flags)
{
        /* SDC threads cannot be swapped. */
        return (-1);
}

/*
 * Get maximum and minimum priorities enjoyed by SDC threads.
 */
static int
sysdc_getclpri(pcpri_t *pcprip)
{
        pcprip->pc_clpmax = sysdc_maxpri;
        pcprip->pc_clpmin = sysdc_minpri;
        return (0);
}

/*ARGSUSED*/
static int
sysdc_getclinfo(void *arg)
{
        return (0);             /* no class-specific info */
}

/*ARGSUSED*/
static int
sysdc_alloc(void **p, int flag)
{
        sysdc_t *new;

        *p = NULL;
        if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
                return (ENOMEM);
        }
        if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
            NULL) {
                kmem_free(new, sizeof (*new));
                return (ENOMEM);
        }
        *p = new;
        return (0);
}

static void
sysdc_free(void *p)
{
        sysdc_t *sdc = p;

        if (sdc != NULL) {
                /*
                 * We must have failed CL_ENTERCLASS(), so our pset should be
                 * there and unused.
                 */
                ASSERT(sdc->sdc_pset != NULL);
                ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
                kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
                kmem_free(sdc, sizeof (*sdc));
        }
}

static int sysdc_enosys();      /* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();

static struct classfuncs sysdc_classfuncs = {
        /* messages to class manager */
        {
                sysdc_enosys,   /* admin */
                sysdc_getclinfo,
                sysdc_enosys,   /* parmsin */
                sysdc_enosys,   /* parmsout */
                sysdc_enosys,   /* vaparmsin */
                sysdc_enosys,   /* vaparmsout */
                sysdc_getclpri,
                sysdc_alloc,
                sysdc_free,
        },
        /* operations on threads */
        {
                sysdc_enterclass,
                sysdc_exitclass,
                sysdc_canexit,
                sysdc_fork,
                sysdc_forkret,
                sysdc_nullsys,  /* parmsget */
                sysdc_enosys,   /* parmsset */
                sysdc_nullsys,  /* stop */
                sysdc_exit,
                sysdc_nullsys,  /* active */
                sysdc_nullsys,  /* inactive */
                sysdc_no_swap,  /* swapin */
                sysdc_no_swap,  /* swapout */
                sysdc_nullsys,  /* trapret */
                sysdc_preempt,
                sysdc_setrun,
                sysdc_sleep,
                sysdc_tick,
                sysdc_wakeup,
                sysdc_einval,   /* donice */
                sysdc_globpri,
                sysdc_nullsys,  /* set_process_group */
                sysdc_nullsys,  /* yield */
                sysdc_einval,   /* doprio */
        }
};

static int
sysdc_enosys()
{
        return (ENOSYS);
}

static int
sysdc_einval()
{
        return (EINVAL);
}

static void
sysdc_nullsys()
{
}

/*ARGSUSED*/
static pri_t
sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
{
        int idx;

        list_create(&sysdc_psets, sizeof (sysdc_pset_t),
            offsetof(sysdc_pset_t, sdp_node));

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_active[idx].sdl_list = &sysdc_dummy;
        }

        sysdc_initparam();

        sysdccid = cid;
        *clfuncspp = &sysdc_classfuncs;

        return ((pri_t)v.v_maxsyspri);
}

static struct sclass csw = {
        "SDC",
        sysdc_init,
        0
};

static struct modlsched modlsched = {
        &mod_schedops, "system duty cycle scheduling class", &csw
};

static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlsched, NULL
};

int
_init()
{
        return (mod_install(&modlinkage));
}

int
_fini()
{
        return (EBUSY);         /* can't unload for now */
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/* --- consolidation-private interfaces --- */
void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
        void *buf = NULL;
        sysdc_params_t sdp;

        SYSDC_INC_STAT(sysdc_thread_enter_enter);

        ASSERT(sysdc_param_init);
        ASSERT(sysdccid >= 0);

        ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

        sdp.sdp_minpri = sysdc_minpri;
        sdp.sdp_maxpri = sysdc_maxpri;
        sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

        VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

        ASSERT(t->t_lwp != NULL);
        ASSERT(t->t_cid == syscid);
        ASSERT(t->t_cldata == NULL);
        VERIFY0(CL_CANEXIT(t, NULL));
        VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
        CL_EXITCLASS(syscid, NULL);
}