/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * The System Duty Cycle (SDC) scheduling class
 * --------------------------------------------
 *
 * Background
 *
 * Kernel threads in Solaris have traditionally not been large consumers
 * of CPU time.  They typically wake up, perform a small amount of
 * work, then go back to sleep waiting for either a timeout or another
 * signal.  On the assumption that the small amount of work that they do
 * is important for the behavior of the whole system, these threads are
 * treated kindly by the dispatcher and the SYS scheduling class: they run
 * without preemption from anything other than real-time and interrupt
 * threads; when preempted, they are put at the front of the queue, so they
 * generally do not migrate between CPUs; and they are allowed to stay
 * running until they voluntarily give up the CPU.
 *
 * As Solaris has evolved, new workloads have emerged which require the
 * kernel to perform significant amounts of CPU-intensive work.  One
 * example of such a workload is ZFS's transaction group sync processing.
 * Each sync operation generates a large batch of I/Os, and each I/O
 * may need to be compressed and/or checksummed before it is written to
 * storage.  The taskq threads which perform the compression and checksums
 * will run nonstop as long as they have work to do; a large sync operation
 * on a compression-heavy dataset can keep them busy for seconds on end.
 * This causes human-time-scale dispatch latency bubbles for any other
 * threads which have the misfortune to share a CPU with the taskq threads.
 *
 * The SDC scheduling class is a solution to this problem.
 *
 *
 * Overview
 *
 * SDC is centered around the concept of a thread's duty cycle (DC):
 *
 *                            ONPROC time
 *      Duty Cycle =    ----------------------
 *                      ONPROC + Runnable time
 *
 * This is the ratio of the time that the thread spent running on a CPU
 * divided by the time it spent running or trying to run.  It is unaffected
 * by any time the thread spent sleeping, stopped, etc.
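 *
 * As a worked example (illustrative numbers only): a thread which, over
 * a measurement interval, spends 20ms ONPROC and 60ms waiting on run
 * queues has a duty cycle of 20 / (20 + 60) = 25%, however long it also
 * spends asleep during that interval.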
 *
 * A thread joining the SDC class specifies a "target" DC that it wants
 * to run at.  To implement this policy, the routine sysdc_update() scans
 * the list of active SDC threads every few ticks and uses each thread's
 * microstate data to compute the actual duty cycle that the thread
 * has experienced recently.  If the thread is under its target DC, its
 * priority is increased to the maximum available (sysdc_maxpri, which is
 * 99 by default).  If the thread is over its target DC, its priority is
 * reduced to the minimum available (sysdc_minpri, 0 by default).  This
 * is a fairly primitive approach, in that it doesn't use any of the
 * intermediate priorities, but it's not completely inappropriate.  Even
 * though threads in the SDC class might take a while to do their job, they
 * are by some definition important if they're running inside the kernel,
 * so it is reasonable that they should get to run at priority 99.
 *
 * If a thread is running when sysdc_update() calculates its actual duty
 * cycle, and there are other threads of equal or greater priority on its
 * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
 * acknowledges the preemption by calling sysdc_preempt(), which calls
 * setbackdq(), which gives other threads with the same priority a chance
 * to run.  This creates a de facto time quantum for threads in the SDC
 * scheduling class.
 *
 * An SDC thread which is assigned priority 0 can continue to run if
 * nothing else needs to use the CPU that it's running on.  Similarly, an
 * SDC thread at priority 99 might not get to run as much as it wants to
 * if there are other priority-99 or higher threads on its CPU.  These
 * situations would cause the thread to get ahead of or behind its target
 * DC; the longer the situations lasted, the further ahead or behind the
 * thread would get.  Rather than condemning a thread to a lifetime of
 * paying for its youthful indiscretions, SDC keeps "base" values for
 * ONPROC and Runnable times in each thread's sysdc data, and updates these
 * values periodically.  The duty cycle is then computed using the elapsed
 * amount of ONPROC and Runnable times since those base times.
 *
 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 * keep the list of "active" threads small by pruning out threads which
 * have been asleep for a while (longer than sysdc_prune_interval_msec).
 * They are not pruned immediately upon going to sleep, since some threads
 * may bounce back and forth between sleeping and being runnable.
 *
 *
 * Interfaces
 *
 * void sysdc_thread_enter(t, dc, flags)
 *
 *      Moves a kernel thread from the SYS scheduling class to the
 *      SDC class. t must have an associated LWP (created by calling
 *      lwp_kernel_create()).  The thread will have a target DC of dc.
 *      Flags should be either 0 or SYSDC_THREAD_BATCH.  If
 *      SYSDC_THREAD_BATCH is specified, the thread is expected to be
 *      doing large amounts of processing.
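 *
 *      A minimal usage sketch (the worker function and the
 *      lwp_kernel_create() arguments below are hypothetical; dc is
 *      expressed in the units of SYSDC_DC_MAX, which appears to be a
 *      percentage here):
 *
 *          kthread_t *t;
 *
 *          t = lwp_kernel_create(sysproc, my_worker, NULL, TS_RUN,
 *              minclsyspri);
 *          sysdc_thread_enter(t, 80, 0);   -- target an 80% duty cycle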
 *
 *
 * Complications
 *
 * - Run queue balancing
 *
 *      The Solaris dispatcher is biased towards letting a thread run
 *      on the same CPU which it last ran on, if no more than 3 ticks
 *      (i.e. rechoose_interval) have passed since the thread last ran.
 *      This helps to preserve cache warmth.  On the other hand, it also
 *      tries to keep the per-CPU run queues fairly balanced; if the CPU
 *      chosen for a runnable thread has a run queue which is three or
 *      more threads longer than a neighboring CPU's queue, the runnable
 *      thread is dispatched onto the neighboring CPU instead.
 *
 *      These policies work well for some workloads, but not for many SDC
 *      threads.  The taskq client of SDC, for example, has many discrete
 *      units of work to do.  The work units are largely independent, so
 *      cache warmth is not an important consideration.  It is important
 *      that the threads fan out quickly to different CPUs, since the
 *      amount of work these threads have to do (a few seconds worth at a
 *      time) doesn't leave much time to correct thread placement errors
 *      (i.e. two SDC threads being dispatched to the same CPU).
 *
 *      To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 *      This tells the dispatcher to keep neighboring run queues' lengths
 *      more evenly matched, which allows SDC threads to migrate more
 *      easily.
 *
 * - LWPs and system processes
 *
 *      SDC can only be used for kernel threads.  Since SDC uses microstate
 *      accounting data to compute each thread's actual duty cycle, all
 *      threads entering the SDC class must have associated LWPs (which
 *      store the microstate data).  This means that the threads have to
 *      be associated with an SSYS process, i.e. one created by newproc().
 *      If the microstate accounting information is ever moved into the
 *      kthread_t, this restriction could be lifted.
 *
 * - Dealing with oversubscription
 *
 *      Since SDC duty cycles are per-thread, it is possible that the
 *      aggregate requested duty cycle of all SDC threads in a processor
 *      set could be greater than the total CPU time available in that set.
 *      The FSS scheduling class has an analogous situation, which it deals
 *      with by reducing each thread's allotted CPU time proportionally.
 *      Since SDC doesn't need to be as precise as FSS, it uses a simpler
 *      solution to the oversubscription problem.
 *
 *      sysdc_update() accumulates the amount of time that max-priority SDC
 *      threads have spent on-CPU in each processor set, and uses that sum
 *      to create an implied duty cycle for that processor set:
 *
 *                              accumulated CPU time
 *         pset DC =    -----------------------------------
 *                       (# CPUs) * time since last update
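 *
 *      As a worked example (illustrative numbers only): with 4 CPUs and
 *      20ms between updates, the denominator is 80ms of available CPU
 *      time; if max-priority SDC threads accumulated 76ms ONPROC in
 *      that window, the pset DC is 95%, which exceeds the default 90%
 *      limit and triggers a break.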
 *
 *      If this implied duty cycle is above a maximum pset duty cycle (90%
 *      by default), sysdc_update() sets the priority of all SDC threads
 *      in that processor set to sysdc_minpri for a "break" period.  After
 *      the break period, it waits for a "nobreak" period before trying to
 *      enforce the pset duty cycle limit again.
 *
 * - Processor sets
 *
 *      As the above implies, SDC is processor set aware, but it does not
 *      currently allow threads to change processor sets while in the SDC
 *      class.  Instead, those threads must join the desired processor set
 *      before entering SDC. [1]
 *
 * - Batch threads
 *
 *      A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *      flag.  This flag currently has no effect, but marks threads which
 *      do bulk processing.
 *
 * - t_kpri_req
 *
 *      The TS and FSS scheduling classes pay attention to t_kpri_req,
 *      which provides a simple form of priority inheritance for
 *      synchronization primitives (such as rwlocks held as READER) which
 *      cannot be traced to a unique thread.  The SDC class does not honor
 *      t_kpri_req, for a few reasons:
 *
 *      1.  t_kpri_req is notoriously inaccurate.  A measure of its
 *          inaccuracy is that it needs to be cleared every time a thread
 *          returns to user mode, because it is frequently non-zero at that
 *          point.  This can happen because "ownership" of synchronization
 *          primitives that use t_kpri_req can be silently handed off,
 *          leaving no opportunity to will the t_kpri_req inheritance.
 *
 *      2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
 *          kernel priority.  This means that even if an SDC thread
 *          is holding a synchronization primitive and running at low
 *          priority, its priority will eventually be raised above 60,
 *          allowing it to drive on and release the resource.
 *
 *      3.  The first consumer of SDC uses the taskq subsystem, which holds
 *          a reader lock for the duration of the task's execution.  This
 *          would mean that SDC threads would never drop below kernel
 *          priority in practice, which defeats one of the purposes of SDC.
 *
 * - Why not FSS?
 *
 *      It might seem that the existing FSS scheduling class could solve
 *      the problems that SDC is attempting to solve.  FSS's more precise
 *      solution to the oversubscription problem would hardly cause
 *      trouble, as long as it performed well.  SDC is implemented as
 *      a separate scheduling class for two main reasons: the initial
 *      consumer of SDC does not map well onto the "project" abstraction
 *      that is central to FSS, and FSS does not expect to run at kernel
 *      priorities.
 *
 *
 * Tunables
 *
 * - sysdc_update_interval_msec:  Number of milliseconds between
 *      consecutive thread priority updates.
 *
 * - sysdc_reset_interval_msec:  Number of milliseconds between
 *      consecutive resets of a thread's base ONPROC and Runnable
 *      times.
 *
 * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 *      before a thread is pruned from the active list.
 *
 * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
 *      CPU time which SDC can give to its high-priority threads.
 *
 * - sysdc_break_msec:  Number of milliseconds of "break" taken when
 *      sysdc_max_pset_DC is exceeded.
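 *
 *      These are ordinary kernel variables, so they should be patchable
 *      via /etc/system in the usual way; for example (assuming the
 *      scheduling module is named "SDC", after its class name):
 *
 *          set SDC:sysdc_break_msec = 100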
 *
 *
 * Future work (in SDC and related subsystems)
 *
 * - Per-thread rechoose interval (0 for SDC)
 *
 *      Allow each thread to specify its own rechoose interval.  SDC
 *      threads would specify an interval of zero, which would rechoose
 *      the CPU with the lowest priority once per update.
 *
 * - Allow threads to change processor sets after joining the SDC class
 *
 * - Thread groups and per-group DC
 *
 *      It might be nice to be able to specify a duty cycle which applies
 *      to a group of threads in aggregate.
 *
 * - Per-group DC callback to allow dynamic DC tuning
 *
 *      Currently, DCs are assigned when the thread joins SDC.  Some
 *      workloads could benefit from being able to tune their DC using
 *      subsystem-specific knowledge about the workload.
 *
 * - Finer-grained priority updates
 *
 * - More nuanced management of oversubscription
 *
 * - Moving other CPU-intensive threads into SDC
 *
 * - Move msacct data into kthread_t
 *
 *      This would allow kernel threads without LWPs to join SDC.
 *
 *
 * Footnotes
 *
 * [1] The details of doing so are left as an exercise for the reader.
 */

#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>

#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/schedctl.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/var.h>

/*
 * Tunables - loaded into the internal state at module load time
 */
uint_t          sysdc_update_interval_msec = 20;
uint_t          sysdc_reset_interval_msec = 400;
uint_t          sysdc_prune_interval_msec = 100;
uint_t          sysdc_max_pset_DC = 90;
uint_t          sysdc_break_msec = 80;

/*
 * Internal state - constants set up by sysdc_initparam()
 */
static clock_t  sysdc_update_ticks;     /* ticks between updates */
static uint_t   sysdc_prune_updates;    /* updates asleep before pruning */
static uint_t   sysdc_reset_updates;    /* # of updates before reset */
static uint_t   sysdc_break_updates;    /* updates to break */
static uint_t   sysdc_nobreak_updates;  /* updates to not check */
static uint_t   sysdc_minDC;            /* minimum allowed DC */
static uint_t   sysdc_maxDC;            /* maximum allowed DC */
static pri_t    sysdc_minpri;           /* minimum allowed priority */
static pri_t    sysdc_maxpri;           /* maximum allowed priority */

/*
 * Internal state
 */
static kmutex_t sysdc_pset_lock;        /* lock protecting pset data */
static list_t   sysdc_psets;            /* list of psets with SDC threads */
static uint_t   sysdc_param_init;       /* sysdc_initparam() has been called */
static uint_t   sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t sysdc_last_update;      /* time of last sysdc_update() */
static sysdc_t  sysdc_dummy;            /* used to terminate active lists */

/*
 * Internal state - active hash table
 */
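/*
 * Active sysdc_t's are hashed by address; the low-order 6 bits are
 * shifted away first, presumably because allocator alignment leaves
 * them with little entropy.
 */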
#define SYSDC_NLISTS    8
#define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t     sysdc_active[SYSDC_NLISTS];
#define SYSDC_LIST(sdc)         (&sysdc_active[SYSDC_HASH(sdc)])

#ifdef DEBUG
static struct {
        uint64_t        sysdc_update_times_asleep;
        uint64_t        sysdc_update_times_base_ran_backwards;
        uint64_t        sysdc_update_times_already_done;
        uint64_t        sysdc_update_times_cur_ran_backwards;
        uint64_t        sysdc_compute_pri_breaking;
        uint64_t        sysdc_activate_enter;
        uint64_t        sysdc_update_enter;
        uint64_t        sysdc_update_exited;
        uint64_t        sysdc_update_not_sdc;
        uint64_t        sysdc_update_idle;
        uint64_t        sysdc_update_take_break;
        uint64_t        sysdc_update_no_psets;
        uint64_t        sysdc_tick_not_sdc;
        uint64_t        sysdc_tick_quantum_expired;
        uint64_t        sysdc_thread_enter_enter;
} sysdc_stats;

#define SYSDC_INC_STAT(x)       (sysdc_stats.x++)
#else
#define SYSDC_INC_STAT(x)       ((void)0)
#endif

/* macros are UPPER CASE */
#define HOWMANY(a, b)   howmany((a), (b))
#define MSECTOTICKS(a)  HOWMANY((a) * 1000, usec_per_tick)

static void
sysdc_initparam(void)
{
        uint_t sysdc_break_ticks;

        /* update / prune intervals */
        sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);

        sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
            sysdc_update_interval_msec);
        sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
            sysdc_update_interval_msec);

        /* We must get at least a little time on CPU. */
        sysdc_minDC = 1;
        sysdc_maxDC = SYSDC_DC_MAX;
        sysdc_minpri = 0;
        sysdc_maxpri = maxclsyspri;

        /* break parameters */
        if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
                sysdc_max_pset_DC = SYSDC_DC_MAX;
        }
        sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
        sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);

        /*
         * We want:
         *
         *      sysdc_max_pset_DC = (nobreak / (break + nobreak))
         *
         *      ==>    nobreak = sysdc_max_pset_DC * (break + nobreak)
         *
         *                          sysdc_max_pset_DC * break
         *      ==>    nobreak = -------------------------
         *                          1 - sysdc_max_pset_DC
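         *
         * Worked numbers with the defaults (assuming SYSDC_DC_MAX is
         * 100): a "break" of 80ms is 4 updates at 20ms each, so
         * nobreak = (4 * 90) / (100 - 90) = 36 updates, i.e. roughly
         * 720ms of "nobreak" time for every 80ms of break.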
         */
        sysdc_nobreak_updates =
            HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
            (SYSDC_DC_MAX - sysdc_max_pset_DC));

        sysdc_param_init = 1;
}

#undef HOWMANY
#undef MSECTOTICKS

#define SDC_UPDATE_INITIAL      0x1     /* for the initial update */
#define SDC_UPDATE_TIMEOUT      0x2     /* from sysdc_update() */
#define SDC_UPDATE_TICK         0x4     /* from sysdc_tick(), on expiry */

/*
 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 * and Runnable times since the last reset.
 *
 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 * to track processor set usage.
 */
static void
sysdc_update_times(sysdc_t *sdc, uint_t flags,
    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t    initial = (flags & SDC_UPDATE_INITIAL);
        const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
        const clock_t   now = ddi_get_lbolt();
        uint_t          do_reset;

        ASSERT(THREAD_LOCK_HELD(t));

        *O = *R = 0;

        /* If we've been sleeping, we know we haven't had any ONPROC time. */
        if (sdc->sdc_sleep_updates != 0 &&
            sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
                *newO = sdc->sdc_last_base_O;
                SYSDC_INC_STAT(sysdc_update_times_asleep);
                return;
        }

        /*
         * If this is our first update, or we've hit the reset point,
         * we need to reset our base_{O,R}.  Once we've updated them, we
         * report O and R for the entire prior interval.
         */
        do_reset = initial;
        if (update) {
                ++sdc->sdc_nupdates;
                if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
                        do_reset = 1;
        }
        if (do_reset) {
                hrtime_t baseO, baseR;
                if (initial) {
                        /*
                         * Start off our cycle count somewhere in the middle,
                         * to keep the resets from all happening at once.
                         *
                         * 4999 is a handy prime much larger than
                         * sysdc_reset_updates, so that we don't run into
                         * trouble if the resolution is a multiple of
                         * sysdc_reset_updates.
                         */
                        sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
                            sysdc_reset_updates);
                        baseO = baseR = 0;
                } else {
                        baseO = sdc->sdc_base_O;
                        baseR = sdc->sdc_base_R;
                }

                mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
                *newO = sdc->sdc_base_O;

                sdc->sdc_reset = now;
                sdc->sdc_pri_check = -1; /* force mismatch below */

                /*
                 * See below for rationale.
                 */
                if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
                        SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
                        baseO = sdc->sdc_base_O;
                        baseR = sdc->sdc_base_R;
                }

                /* compute based on the entire interval */
                *O = (sdc->sdc_base_O - baseO);
                *R = (sdc->sdc_base_R - baseR);
                return;
        }

        /*
         * If we're called from sysdc_update(), we *must* return a value
         * for newO, so we always call mstate_systhread_times().
         *
         * Otherwise, if we've already done a pri check this tick,
         * we can skip it.
         */
        if (!update && sdc->sdc_pri_check == now) {
                SYSDC_INC_STAT(sysdc_update_times_already_done);
                return;
        }

        /* Get the current times from the thread */
        sdc->sdc_pri_check = now;
        mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
        *newO = sdc->sdc_cur_O;

        /*
         * The updating of microstate accounting is not done under a
         * consistent set of locks, particularly the t_waitrq field.  This
         * can lead to narrow windows in which we account for time in the
         * wrong bucket, which on the next read will be accounted for
         * correctly.
         *
         * If our sdc_base_* fields were affected by one of these blips, we
         * throw away the old data, and pretend this tick didn't happen.
         */
        if (sdc->sdc_cur_O < sdc->sdc_base_O ||
            sdc->sdc_cur_R < sdc->sdc_base_R) {

                sdc->sdc_base_O = sdc->sdc_cur_O;
                sdc->sdc_base_R = sdc->sdc_cur_R;

                SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
                return;
        }

        *O = sdc->sdc_cur_O - sdc->sdc_base_O;
        *R = sdc->sdc_cur_R - sdc->sdc_base_R;
}

/*
 * sysdc_compute_pri()
 *
 *      Recomputes the priority of the thread, leaving the result in
 *      sdc->sdc_epri.  Returns 1 if a priority update should occur
 *      (which will also trigger a cpu_surrender()), otherwise
 *      returns 0.
 */
static uint_t
sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
        const uint_t    tick = (flags & SDC_UPDATE_TICK);

        hrtime_t        O, R;
        hrtime_t        newO = -1;

        ASSERT(THREAD_LOCK_HELD(t));

        sysdc_update_times(sdc, flags, &O, &R, &newO);
        ASSERT(!update || newO != -1);

        /* If we have new data, recompute our priority. */
        if ((O + R) != 0) {
                sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);

                /* Adjust our priority to move our DC closer to the target. */
                if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
                        sdc->sdc_pri = sdc->sdc_maxpri;
                else
                        sdc->sdc_pri = sdc->sdc_minpri;
        }

        /*
         * If our per-pset duty cycle goes over the max, we will take a break.
         * This forces all sysdc threads in the pset to minimum priority, in
         * order to let everyone else have a chance at the CPU.
         */
        if (sdc->sdc_pset->sdp_need_break) {
                SYSDC_INC_STAT(sysdc_compute_pri_breaking);
                sdc->sdc_epri = sdc->sdc_minpri;
        } else {
                sdc->sdc_epri = sdc->sdc_pri;
        }

        DTRACE_PROBE4(sysdc__compute__pri,
            kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
            uint_t, sdc->sdc_target_DC);

        /*
         * For sysdc_update(), we compute the ONPROC time for high-priority
         * threads, which is used to calculate the per-pset duty cycle.  We
         * will always tell our callers to update the thread's priority,
         * since we want to force a cpu_surrender().
         *
         * We reset sdc_update_ticks so that sysdc_tick() will only update
         * the thread's priority if our timeout is delayed by a tick or
         * more.
         */
        if (update) {
                /* SDC threads are not allowed to change cpupart bindings. */
                ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);

                /* If we were at MAXPRI, account for our onproc time. */
                if (t->t_pri == sdc->sdc_maxpri &&
                    sdc->sdc_last_base_O != 0 &&
                    sdc->sdc_last_base_O < newO) {
                        sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
                        sdc->sdc_pset->sdp_onproc_time +=
                            (uint64_t)sdc->sdc_last_O;
                        sdc->sdc_pset->sdp_onproc_threads++;
                } else {
                        sdc->sdc_last_O = 0;
                }
                sdc->sdc_last_base_O = newO;

                sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
                return (1);
        }

        /*
         * Like sysdc_update(), sysdc_tick() always wants to update the
         * thread's priority, so that the CPU is surrendered if necessary.
         * We reset sdc_update_ticks so that if the timeout continues to be
         * delayed, we'll update at the regular interval.
         */
        if (tick) {
                ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
                sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
                return (1);
        }

        /*
         * Otherwise, only tell our callers to update the priority if it has
         * changed.
         */
        return (sdc->sdc_epri != t->t_pri);
}

static void
sysdc_update_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *t = sdc->sdc_thread;

        ASSERT(THREAD_LOCK_HELD(t));

        if (sysdc_compute_pri(sdc, flags)) {
                if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
                        cpu_surrender(t);
                }
        }
}

/*
 * Add a thread onto the active list.  It will only be removed by
 * sysdc_update().
 */
static void
sysdc_activate(sysdc_t *sdc)
{
        sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
        sysdc_t         *head;
        kthread_t       *t = sdc->sdc_thread;

        SYSDC_INC_STAT(sysdc_activate_enter);

        ASSERT(sdc->sdc_next == NULL);
        ASSERT(THREAD_LOCK_HELD(t));

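        /*
         * Lock-free push onto the singly-linked active list: re-read the
         * head and retry if a concurrent update changed it underneath us.
         */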
        do {
                head = *headp;
                sdc->sdc_next = head;
        } while (atomic_cas_ptr(headp, head, sdc) != head);
}

/*
 * sysdc_update() has two jobs:
 *
 *      1. It updates the priorities of all active SDC threads on the system.
 *      2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
 */
static void
sysdc_update(void *arg)
{
        int             idx;
        sysdc_t         *freelist = NULL;
        sysdc_pset_t    *cur;
        hrtime_t        now, diff;
        uint_t          redeploy = 1;

        SYSDC_INC_STAT(sysdc_update_enter);

        ASSERT(sysdc_update_timeout_started);

        /*
         * If this is our first time through, diff will be gigantic, and
         * no breaks will be necessary.
         */
        now = gethrtime();
        diff = now - sysdc_last_update;
        sysdc_last_update = now;

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {
                boolean_t breaking = (cur->sdp_should_break != 0);

                if (cur->sdp_need_break != breaking) {
                        DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
                            boolean_t, breaking);
                }
                cur->sdp_onproc_time = 0;
                cur->sdp_onproc_threads = 0;
                cur->sdp_need_break = breaking;
        }
        mutex_exit(&sysdc_pset_lock);

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_list_t            *sdl = &sysdc_active[idx];
                sysdc_t *volatile       *headp = &sdl->sdl_list;
                sysdc_t                 *head, *tail;
                sysdc_t                 **prevptr;

                if (*headp == &sysdc_dummy)
                        continue;

                /* Prevent any threads from exiting while we're poking them. */
                mutex_enter(&sdl->sdl_lock);

                /*
                 * Each sdl_list contains a singly-linked list of active
                 * threads. Threads which become active while we are
                 * processing the list will be added to sdl_list.  Since we
                 * don't want that to interfere with our own processing, we
                 * swap in an empty list.  Any newly active threads will
                 * go on to this empty list.  When finished, we'll put any
                 * such threads at the end of the processed list.
                 */
                head = atomic_swap_ptr(headp, &sysdc_dummy);
                prevptr = &head;
                while (*prevptr != &sysdc_dummy) {
                        sysdc_t         *const  sdc = *prevptr;
                        kthread_t       *const  t = sdc->sdc_thread;

                        /*
                         * If the thread has exited, move its sysdc_t onto
                         * freelist, to be freed later.
                         */
                        if (t == NULL) {
                                *prevptr = sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_exited);
                                sdc->sdc_next = freelist;
                                freelist = sdc;
                                continue;
                        }

                        thread_lock(t);
                        if (t->t_cid != sysdccid) {
                                thread_unlock(t);
                                prevptr = &sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_not_sdc);
                                continue;
                        }
                        ASSERT(t->t_cldata == sdc);

                        /*
                         * If the thread has been sleeping for longer
                         * than sysdc_prune_interval, make it inactive by
                         * removing it from the list.
                         */
                        if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
                            sdc->sdc_sleep_updates != 0 &&
                            (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
                            sysdc_prune_updates) {
                                *prevptr = sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_idle);
                                sdc->sdc_next = NULL;
                                thread_unlock(t);
                                continue;
                        }
                        sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
                        thread_unlock(t);

                        prevptr = &sdc->sdc_next;
                }

                /*
                 * Add our list to the bucket, putting any new entries
                 * added while we were working at the tail of the list.
                 */
                do {
                        tail = *headp;
                        *prevptr = tail;
                } while (atomic_cas_ptr(headp, tail, head) != tail);

                mutex_exit(&sdl->sdl_lock);
        }

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {

                cur->sdp_vtime_last_interval =
                    diff * cur->sdp_cpupart->cp_ncpus;
                cur->sdp_DC_last_interval =
                    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
                    cur->sdp_vtime_last_interval;

                if (cur->sdp_should_break > 0) {
                        cur->sdp_should_break--;        /* breaking */
                        continue;
                }
                if (cur->sdp_dont_break > 0) {
                        cur->sdp_dont_break--;  /* waiting before checking */
                        continue;
                }
                if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
                        cur->sdp_should_break = sysdc_break_updates;
                        cur->sdp_dont_break = sysdc_nobreak_updates;
                        SYSDC_INC_STAT(sysdc_update_take_break);
                }
        }

        /*
         * If there are no sysdc_psets, there can be no threads, so
         * we can stop doing our timeout.  Since we're holding the
         * sysdc_pset_lock, no new sysdc_psets can come in, which will
         * prevent anyone from racing with this and dropping our timeout
         * on the floor.
         */
        if (list_is_empty(&sysdc_psets)) {
                SYSDC_INC_STAT(sysdc_update_no_psets);
                ASSERT(sysdc_update_timeout_started);
                sysdc_update_timeout_started = 0;

                redeploy = 0;
        }
        mutex_exit(&sysdc_pset_lock);

        while (freelist != NULL) {
                sysdc_t *cur = freelist;
                freelist = cur->sdc_next;
                kmem_free(cur, sizeof (*cur));
        }

        if (redeploy) {
                (void) timeout(sysdc_update, arg, sysdc_update_ticks);
        }
}

static void
sysdc_preempt(kthread_t *t)
{
        ASSERT(t == curthread);
        ASSERT(THREAD_LOCK_HELD(t));

        setbackdq(t);           /* give others a chance to run */
}

static void
sysdc_tick(kthread_t *t)
{
        sysdc_t *sdc;

        thread_lock(t);
        if (t->t_cid != sysdccid) {
                SYSDC_INC_STAT(sysdc_tick_not_sdc);
                thread_unlock(t);
                return;
        }
        sdc = t->t_cldata;
        if (t->t_state == TS_ONPROC &&
            t->t_pri < t->t_disp_queue->disp_maxrunpri) {
                cpu_surrender(t);
        }

        if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
                ASSERT(sdc->sdc_sleep_updates == 0);
        }

        ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        sdc->sdc_ticks++;
        if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
                SYSDC_INC_STAT(sysdc_tick_quantum_expired);
                sysdc_update_pri(sdc, SDC_UPDATE_TICK);
                ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        }
        thread_unlock(t);
}

static void
sysdc_setrun(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = 0;

        if (sdc->sdc_next == NULL) {
                /*
                 * Since we're in transition, we don't want to use the
                 * full thread_update_pri().
                 */
                if (sysdc_compute_pri(sdc, 0)) {
                        THREAD_CHANGE_PRI(t, sdc->sdc_epri);
                }
                sysdc_activate(sdc);

                ASSERT(sdc->sdc_next != NULL);
        }

        setbackdq(t);
}

static void
sysdc_wakeup(kthread_t *t)
{
        sysdc_setrun(t);
}

static void
sysdc_sleep(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = sdc->sdc_nupdates;
}

/*ARGSUSED*/
static int
sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
    void *bufp)
{
        cpupart_t *const cpupart = t->t_cpupart;
        sysdc_t *sdc = bufp;
        sysdc_params_t *sdpp = parmsp;
        sysdc_pset_t *newpset = sdc->sdc_pset;
        sysdc_pset_t *pset;
        int start_timeout;

        if (t->t_cid != syscid)
                return (EPERM);

        ASSERT(ttolwp(t) != NULL);
        ASSERT(sdpp != NULL);
        ASSERT(newpset != NULL);
        ASSERT(sysdc_param_init);

        ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
        ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
        ASSERT(sdpp->sdp_DC >= sysdc_minDC);
        ASSERT(sdpp->sdp_DC <= sysdc_maxDC);

        sdc->sdc_thread = t;
        sdc->sdc_pri = sdpp->sdp_maxpri;        /* start off maximally */
        sdc->sdc_minpri = sdpp->sdp_minpri;
        sdc->sdc_maxpri = sdpp->sdp_maxpri;
        sdc->sdc_target_DC = sdpp->sdp_DC;
        sdc->sdc_ticks = 0;
        sdc->sdc_update_ticks = sysdc_update_ticks + 1;

        /* Assign ourselves to the appropriate pset. */
        sdc->sdc_pset = NULL;
        mutex_enter(&sysdc_pset_lock);
        for (pset = list_head(&sysdc_psets); pset != NULL;
            pset = list_next(&sysdc_psets, pset)) {
                if (pset->sdp_cpupart == cpupart) {
                        break;
                }
        }
        if (pset == NULL) {
                pset = newpset;
                newpset = NULL;
                pset->sdp_cpupart = cpupart;
                list_insert_tail(&sysdc_psets, pset);
        }
        pset->sdp_nthreads++;
        ASSERT(pset->sdp_nthreads > 0);

        sdc->sdc_pset = pset;

        start_timeout = (sysdc_update_timeout_started == 0);
        sysdc_update_timeout_started = 1;
        mutex_exit(&sysdc_pset_lock);

        if (newpset != NULL)
                kmem_free(newpset, sizeof (*newpset));

        /* Update t's scheduling class and priority. */
        thread_lock(t);
        t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
        t->t_cid = cid;
        t->t_cldata = sdc;
        t->t_schedflag |= TS_RUNQMATCH;

        sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
        thread_unlock(t);

        /* Kick off the thread timeout if we're the first one in. */
        if (start_timeout) {
                (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
        }

        return (0);
}

static void
sysdc_leave(sysdc_t *sdc)
{
        sysdc_pset_t *sdp = sdc->sdc_pset;
        sysdc_list_t *sdl = SYSDC_LIST(sdc);
        uint_t freedc;

        mutex_enter(&sdl->sdl_lock);    /* block sysdc_update() */
        sdc->sdc_thread = NULL;
        freedc = (sdc->sdc_next == NULL);
        mutex_exit(&sdl->sdl_lock);

        mutex_enter(&sysdc_pset_lock);
        ASSERT(sdp != NULL);
        ASSERT(sdp->sdp_nthreads > 0);
        --sdp->sdp_nthreads;
        if (sdp->sdp_nthreads == 0) {
                list_remove(&sysdc_psets, sdp);
        } else {
                sdp = NULL;
        }
        mutex_exit(&sysdc_pset_lock);

        if (freedc)
                kmem_free(sdc, sizeof (*sdc));
        if (sdp != NULL)
                kmem_free(sdp, sizeof (*sdp));
}

static void
sysdc_exitclass(void *buf)
{
        sysdc_leave((sysdc_t *)buf);
}

/*ARGSUSED*/
static int
sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
        /* Threads cannot exit SDC once joined, except in a body bag. */
        return (EPERM);
}

static void
sysdc_exit(kthread_t *t)
{
        sysdc_t *sdc;

        /* We're exiting, so we just rejoin the SYS class. */
        thread_lock(t);
        ASSERT(t->t_cid == sysdccid);
        sdc = t->t_cldata;
        t->t_cid = syscid;
        t->t_cldata = NULL;
        t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
        (void) thread_change_pri(t, maxclsyspri, 0);
        t->t_schedflag &= ~TS_RUNQMATCH;
        thread_unlock_nopreempt(t);

        /* Unlink the sdc from everything. */
        sysdc_leave(sdc);
}

/*ARGSUSED*/
static int
sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
        /*
         * Threads cannot be created with SDC as their class; they must
         * be created as SYS and then added with sysdc_thread_enter().
         * Because of this restriction, sysdc_fork() should never be called.
         */
        panic("sysdc cannot be forked");

        return (ENOSYS);
}

/*ARGSUSED*/
static void
sysdc_forkret(kthread_t *t, kthread_t *ct)
{
        /* SDC threads are part of system processes, which never fork. */
        panic("sysdc cannot be forked");
}

static pri_t
sysdc_globpri(kthread_t *t)
{
        return (t->t_epri);
}

/*ARGSUSED*/
static pri_t
sysdc_no_swap(kthread_t *t, int flags)
{
        /* SDC threads cannot be swapped. */
        return (-1);
}

/*
 * Get maximum and minimum priorities enjoyed by SDC threads.
 */
static int
sysdc_getclpri(pcpri_t *pcprip)
{
        pcprip->pc_clpmax = sysdc_maxpri;
        pcprip->pc_clpmin = sysdc_minpri;
        return (0);
}

/*ARGSUSED*/
static int
sysdc_getclinfo(void *arg)
{
        return (0);             /* no class-specific info */
}

/*ARGSUSED*/
static int
sysdc_alloc(void **p, int flag)
{
        sysdc_t *new;

        *p = NULL;
        if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
                return (ENOMEM);
        }
        if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
            NULL) {
                kmem_free(new, sizeof (*new));
                return (ENOMEM);
        }
        *p = new;
        return (0);
}

static void
sysdc_free(void *p)
{
        sysdc_t *sdc = p;

        if (sdc != NULL) {
                /*
                 * We must have failed CL_ENTERCLASS(), so our pset should be
                 * there and unused.
                 */
                ASSERT(sdc->sdc_pset != NULL);
                ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
                kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
                kmem_free(sdc, sizeof (*sdc));
        }
}

static int sysdc_enosys();      /* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();

static struct classfuncs sysdc_classfuncs = {
        /* messages to class manager */
        {
                sysdc_enosys,   /* admin */
                sysdc_getclinfo,
                sysdc_enosys,   /* parmsin */
                sysdc_enosys,   /* parmsout */
                sysdc_enosys,   /* vaparmsin */
                sysdc_enosys,   /* vaparmsout */
                sysdc_getclpri,
                sysdc_alloc,
                sysdc_free,
        },
        /* operations on threads */
        {
                sysdc_enterclass,
                sysdc_exitclass,
                sysdc_canexit,
                sysdc_fork,
                sysdc_forkret,
                sysdc_nullsys,  /* parmsget */
                sysdc_enosys,   /* parmsset */
                sysdc_nullsys,  /* stop */
                sysdc_exit,
                sysdc_nullsys,  /* active */
                sysdc_nullsys,  /* inactive */
                sysdc_no_swap,  /* swapin */
                sysdc_no_swap,  /* swapout */
                sysdc_nullsys,  /* trapret */
                sysdc_preempt,
                sysdc_setrun,
                sysdc_sleep,
                sysdc_tick,
                sysdc_wakeup,
                sysdc_einval,   /* donice */
                sysdc_globpri,
                sysdc_nullsys,  /* set_process_group */
                sysdc_nullsys,  /* yield */
                sysdc_einval,   /* doprio */
        }
};

static int
sysdc_enosys()
{
        return (ENOSYS);
}

static int
sysdc_einval()
{
        return (EINVAL);
}

static void
sysdc_nullsys()
{
}

/*ARGSUSED*/
static pri_t
sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
{
        int idx;

        list_create(&sysdc_psets, sizeof (sysdc_pset_t),
            offsetof(sysdc_pset_t, sdp_node));

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_active[idx].sdl_list = &sysdc_dummy;
        }

        sysdc_initparam();

        sysdccid = cid;
        *clfuncspp = &sysdc_classfuncs;

        return ((pri_t)v.v_maxsyspri);
}

static struct sclass csw = {
        "SDC",
        sysdc_init,
        0
};

static struct modlsched modlsched = {
        &mod_schedops, "system duty cycle scheduling class", &csw
};

static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlsched, NULL
};

int
_init()
{
        return (mod_install(&modlinkage));
}

int
_fini()
{
        return (EBUSY);         /* can't unload for now */
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/* --- consolidation-private interfaces --- */
void
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
        void *buf = NULL;
        sysdc_params_t sdp;

        SYSDC_INC_STAT(sysdc_thread_enter_enter);

        ASSERT(sysdc_param_init);
        ASSERT(sysdccid >= 0);

        ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

        sdp.sdp_minpri = sysdc_minpri;
        sdp.sdp_maxpri = sysdc_maxpri;
        sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

        VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

        ASSERT(t->t_lwp != NULL);
        ASSERT(t->t_cid == syscid);
        ASSERT(t->t_cldata == NULL);
        VERIFY0(CL_CANEXIT(t, NULL));
        VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
        CL_EXITCLASS(syscid, NULL);
}