illumos-gate Wdiff usr/src/uts/common/disp/sysdc.c

Print this page

7127  remove -Wno-missing-braces from Makefile.uts

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/disp/sysdc.c
          +++ new/usr/src/uts/common/disp/sysdc.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * The System Duty Cycle (SDC) scheduling class
  28   28   * --------------------------------------------
  29   29   *
  30   30   * Background
  31   31   *
  32   32   * Kernel threads in Solaris have traditionally not been large consumers
  33   33   * of CPU time.  They typically wake up, perform a small amount of
  34   34   * work, then go back to sleep waiting for either a timeout or another
  35   35   * signal.  On the assumption that the small amount of work that they do
  36   36   * is important for the behavior of the whole system, these threads are
  37   37   * treated kindly by the dispatcher and the SYS scheduling class: they run
  38   38   * without preemption from anything other than real-time and interrupt
  39   39   * threads; when preempted, they are put at the front of the queue, so they
  40   40   * generally do not migrate between CPUs; and they are allowed to stay
  41   41   * running until they voluntarily give up the CPU.
  42   42   *
  43   43   * As Solaris has evolved, new workloads have emerged which require the
  44   44   * kernel to perform significant amounts of CPU-intensive work.  One
  45   45   * example of such a workload is ZFS's transaction group sync processing.
  46   46   * Each sync operation generates a large batch of I/Os, and each I/O
  47   47   * may need to be compressed and/or checksummed before it is written to
  48   48   * storage.  The taskq threads which perform the compression and checksums
  49   49   * will run nonstop as long as they have work to do; a large sync operation
  50   50   * on a compression-heavy dataset can keep them busy for seconds on end.
  51   51   * This causes human-time-scale dispatch latency bubbles for any other
  52   52   * threads which have the misfortune to share a CPU with the taskq threads.
  53   53   *
  54   54   * The SDC scheduling class is a solution to this problem.
  55   55   *
  56   56   *
  57   57   * Overview
  58   58   *
  59   59   * SDC is centered around the concept of a thread's duty cycle (DC):
  60   60   *
  61   61   *                            ONPROC time
  62   62   *      Duty Cycle =    ----------------------
  63   63   *                      ONPROC + Runnable time
  64   64   *
  65   65   * This is the ratio of the time that the thread spent running on a CPU
  66   66   * divided by the time it spent running or trying to run.  It is unaffected
  67   67   * by any time the thread spent sleeping, stopped, etc.
  68   68   *
  69   69   * A thread joining the SDC class specifies a "target" DC that it wants
  70   70   * to run at.  To implement this policy, the routine sysdc_update() scans
  71   71   * the list of active SDC threads every few ticks and uses each thread's
  72   72   * microstate data to compute the actual duty cycle that that thread
  73   73   * has experienced recently.  If the thread is under its target DC, its
  74   74   * priority is increased to the maximum available (sysdc_maxpri, which is
  75   75   * 99 by default).  If the thread is over its target DC, its priority is
  76   76   * reduced to the minimum available (sysdc_minpri, 0 by default).  This
  77   77   * is a fairly primitive approach, in that it doesn't use any of the
  78   78   * intermediate priorities, but it's not completely inappropriate.  Even
  79   79   * though threads in the SDC class might take a while to do their job, they
  80   80   * are by some definition important if they're running inside the kernel,
  81   81   * so it is reasonable that they should get to run at priority 99.
  82   82   *
  83   83   * If a thread is running when sysdc_update() calculates its actual duty
  84   84   * cycle, and there are other threads of equal or greater priority on its
  85   85   * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
  86   86   * acknowledges the preemption by calling sysdc_preempt(), which calls
  87   87   * setbackdq(), which gives other threads with the same priority a chance
  88   88   * to run.  This creates a de facto time quantum for threads in the SDC
  89   89   * scheduling class.
  90   90   *
  91   91   * An SDC thread which is assigned priority 0 can continue to run if
  92   92   * nothing else needs to use the CPU that it's running on.  Similarly, an
  93   93   * SDC thread at priority 99 might not get to run as much as it wants to
  94   94   * if there are other priority-99 or higher threads on its CPU.  These
  95   95   * situations would cause the thread to get ahead of or behind its target
  96   96   * DC; the longer the situations lasted, the further ahead or behind the
  97   97   * thread would get.  Rather than condemning a thread to a lifetime of
  98   98   * paying for its youthful indiscretions, SDC keeps "base" values for
  99   99   * ONPROC and Runnable times in each thread's sysdc data, and updates these
 100  100   * values periodically.  The duty cycle is then computed using the elapsed
 101  101   * amount of ONPROC and Runnable times since those base times.
 102  102   *
 103  103   * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 104  104   * keep the list of "active" threads small by pruning out threads which
 105  105   * have been asleep for a brief time.  They are not pruned immediately upon
 106  106   * going to sleep, since some threads may bounce back and forth between
 107  107   * sleeping and being runnable.
 108  108   *
 109  109   *
 110  110   * Interfaces
 111  111   *
 112  112   * void sysdc_thread_enter(t, dc, flags)
 113  113   *
 114  114   *      Moves a kernel thread from the SYS scheduling class to the
 115  115   *      SDC class. t must have an associated LWP (created by calling
 116  116   *      lwp_kernel_create()).  The thread will have a target DC of dc.
 117  117   *      Flags should be either 0 or SYSDC_THREAD_BATCH.  If
 118  118   *      SYSDC_THREAD_BATCH is specified, the thread is expected to be
 119  119   *      doing large amounts of processing.
 120  120   *
 121  121   *
 122  122   * Complications
 123  123   *
 124  124   * - Run queue balancing
 125  125   *
 126  126   *      The Solaris dispatcher is biased towards letting a thread run
 127  127   *      on the same CPU which it last ran on, if no more than 3 ticks
 128  128   *      (i.e. rechoose_interval) have passed since the thread last ran.
 129  129   *      This helps to preserve cache warmth.  On the other hand, it also
 130  130   *      tries to keep the per-CPU run queues fairly balanced; if the CPU
 131  131   *      chosen for a runnable thread has a run queue which is three or
 132  132   *      more threads longer than a neighboring CPU's queue, the runnable
 133  133   *      thread is dispatched onto the neighboring CPU instead.
 134  134   *
 135  135   *      These policies work well for some workloads, but not for many SDC
 136  136   *      threads.  The taskq client of SDC, for example, has many discrete
 137  137   *      units of work to do.  The work units are largely independent, so
 138  138   *      cache warmth is not an important consideration.  It is important
 139  139   *      that the threads fan out quickly to different CPUs, since the
 140  140   *      amount of work these threads have to do (a few seconds worth at a
 141  141   *      time) doesn't leave much time to correct thread placement errors
 142  142   *      (i.e. two SDC threads being dispatched to the same CPU).
 143  143   *
 144  144   *      To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 145  145   *      This tells the dispatcher to keep neighboring run queues' lengths
 146  146   *      more evenly matched, which allows SDC threads to migrate more
 147  147   *      easily.
 148  148   *
 149  149   * - LWPs and system processes
 150  150   *
 151  151   *      SDC can only be used for kernel threads.  Since SDC uses microstate
 152  152   *      accounting data to compute each thread's actual duty cycle, all
 153  153   *      threads entering the SDC class must have associated LWPs (which
 154  154   *      store the microstate data).  This means that the threads have to
 155  155   *      be associated with an SSYS process, i.e. one created by newproc().
 156  156   *      If the microstate accounting information is ever moved into the
 157  157   *      kthread_t, this restriction could be lifted.
 158  158   *
 159  159   * - Dealing with oversubscription
 160  160   *
 161  161   *      Since SDC duty cycles are per-thread, it is possible that the
 162  162   *      aggregate requested duty cycle of all SDC threads in a processor
 163  163   *      set could be greater than the total CPU time available in that set.
 164  164   *      The FSS scheduling class has an analogous situation, which it deals
 165  165   *      with by reducing each thread's allotted CPU time proportionally.
 166  166   *      Since SDC doesn't need to be as precise as FSS, it uses a simpler
 167  167   *      solution to the oversubscription problem.
 168  168   *
 169  169   *      sysdc_update() accumulates the amount of time that max-priority SDC
 170  170   *      threads have spent on-CPU in each processor set, and uses that sum
 171  171   *      to create an implied duty cycle for that processor set:
 172  172   *
 173  173   *                              accumulated CPU time
 174  174   *         pset DC =    -----------------------------------
 175  175   *                       (# CPUs) * time since last update
 176  176   *
 177  177   *      If this implied duty cycle is above a maximum pset duty cycle (90%
 178  178   *      by default), sysdc_update() sets the priority of all SDC threads
 179  179   *      in that processor set to sysdc_minpri for a "break" period.  After
 180  180   *      the break period, it waits for a "nobreak" period before trying to
 181  181   *      enforce the pset duty cycle limit again.
 182  182   *
 183  183   * - Processor sets
 184  184   *
 185  185   *      As the above implies, SDC is processor set aware, but it does not
 186  186   *      currently allow threads to change processor sets while in the SDC
 187  187   *      class.  Instead, those threads must join the desired processor set
 188  188   *      before entering SDC. [1]
 189  189   *
 190  190   * - Batch threads
 191  191   *
  192  192   *      A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 193  193   *      flag.  This flag currently has no effect, but marks threads which
 194  194   *      do bulk processing.
 195  195   *
 196  196   * - t_kpri_req
 197  197   *
 198  198   *      The TS and FSS scheduling classes pay attention to t_kpri_req,
 199  199   *      which provides a simple form of priority inheritance for
 200  200   *      synchronization primitives (such as rwlocks held as READER) which
 201  201   *      cannot be traced to a unique thread.  The SDC class does not honor
 202  202   *      t_kpri_req, for a few reasons:
 203  203   *
 204  204   *      1.  t_kpri_req is notoriously inaccurate.  A measure of its
 205  205   *          inaccuracy is that it needs to be cleared every time a thread
 206  206   *          returns to user mode, because it is frequently non-zero at that
 207  207   *          point.  This can happen because "ownership" of synchronization
 208  208   *          primitives that use t_kpri_req can be silently handed off,
 209  209   *          leaving no opportunity to will the t_kpri_req inheritance.
 210  210   *
 211  211   *      2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
 212  212   *          kernel priority.  This means that even if an SDC thread
 213  213   *          is holding a synchronization primitive and running at low
 214  214   *          priority, its priority will eventually be raised above 60,
 215  215   *          allowing it to drive on and release the resource.
 216  216   *
 217  217   *      3.  The first consumer of SDC uses the taskq subsystem, which holds
 218  218   *          a reader lock for the duration of the task's execution.  This
 219  219   *          would mean that SDC threads would never drop below kernel
 220  220   *          priority in practice, which defeats one of the purposes of SDC.
 221  221   *
 222  222   * - Why not FSS?
 223  223   *
 224  224   *      It might seem that the existing FSS scheduling class could solve
 225  225   *      the problems that SDC is attempting to solve.  FSS's more precise
 226  226   *      solution to the oversubscription problem would hardly cause
 227  227   *      trouble, as long as it performed well.  SDC is implemented as
 228  228   *      a separate scheduling class for two main reasons: the initial
 229  229   *      consumer of SDC does not map well onto the "project" abstraction
 230  230   *      that is central to FSS, and FSS does not expect to run at kernel
 231  231   *      priorities.
 232  232   *
 233  233   *
 234  234   * Tunables
 235  235   *
 236  236   * - sysdc_update_interval_msec:  Number of milliseconds between
 237  237   *      consecutive thread priority updates.
 238  238   *
 239  239   * - sysdc_reset_interval_msec:  Number of milliseconds between
 240  240   *      consecutive resets of a thread's base ONPROC and Runnable
 241  241   *      times.
 242  242   *
 243  243   * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 244  244   *      before a thread is pruned from the active list.
 245  245   *
 246  246   * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
 247  247   *      CPU time which SDC can give to its high-priority threads.
 248  248   *
 249  249   * - sysdc_break_msec:  Number of milliseconds of "break" taken when
 250  250   *      sysdc_max_pset_DC is exceeded.
 251  251   *
 252  252   *
 253  253   * Future work (in SDC and related subsystems)
 254  254   *
 255  255   * - Per-thread rechoose interval (0 for SDC)
 256  256   *
 257  257   *      Allow each thread to specify its own rechoose interval.  SDC
 258  258   *      threads would specify an interval of zero, which would rechoose
 259  259   *      the CPU with the lowest priority once per update.
 260  260   *
 261  261   * - Allow threads to change processor sets after joining the SDC class
 262  262   *
 263  263   * - Thread groups and per-group DC
 264  264   *
 265  265   *      It might be nice to be able to specify a duty cycle which applies
 266  266   *      to a group of threads in aggregate.
 267  267   *
 268  268   * - Per-group DC callback to allow dynamic DC tuning
 269  269   *
 270  270   *      Currently, DCs are assigned when the thread joins SDC.  Some
 271  271   *      workloads could benefit from being able to tune their DC using
 272  272   *      subsystem-specific knowledge about the workload.
 273  273   *
 274  274   * - Finer-grained priority updates
 275  275   *
 276  276   * - More nuanced management of oversubscription
 277  277   *
 278  278   * - Moving other CPU-intensive threads into SDC
 279  279   *
 280  280   * - Move msacct data into kthread_t
 281  281   *
 282  282   *      This would allow kernel threads without LWPs to join SDC.
 283  283   *
 284  284   *
 285  285   * Footnotes
 286  286   *
 287  287   * [1] The details of doing so are left as an exercise for the reader.
 288  288   */
 289  289  
 290  290  #include <sys/types.h>
 291  291  #include <sys/sysdc.h>
 292  292  #include <sys/sysdc_impl.h>
 293  293  
 294  294  #include <sys/class.h>
 295  295  #include <sys/cmn_err.h>
 296  296  #include <sys/cpuvar.h>
 297  297  #include <sys/cpupart.h>
 298  298  #include <sys/debug.h>
 299  299  #include <sys/disp.h>
 300  300  #include <sys/errno.h>
 301  301  #include <sys/inline.h>
 302  302  #include <sys/kmem.h>
 303  303  #include <sys/modctl.h>
 304  304  #include <sys/schedctl.h>
 305  305  #include <sys/sdt.h>
 306  306  #include <sys/sunddi.h>
 307  307  #include <sys/sysmacros.h>
 308  308  #include <sys/systm.h>
 309  309  #include <sys/var.h>
 310  310  
 311  311  /*
 312  312   * Tunables - loaded into the internal state at module load time
 313  313   */
  314  314  uint_t          sysdc_update_interval_msec = 20;        /* msec between priority updates */
  315  315  uint_t          sysdc_reset_interval_msec = 400;        /* msec between base O/R resets */
  316  316  uint_t          sysdc_prune_interval_msec = 100;        /* msec asleep before pruning */
  317  317  uint_t          sysdc_max_pset_DC = 90;                 /* max per-pset duty cycle (%) */
  318  318  uint_t          sysdc_break_msec = 80;                  /* msec of "break" when over max */
 319  319  
 320  320  /*
 321  321   * Internal state - constants set up by sysdc_initparam()
 322  322   */
  323  323  static clock_t  sysdc_update_ticks;     /* ticks between updates */
  324  324  static uint_t   sysdc_prune_updates;    /* updates asleep before pruning */
  325  325  static uint_t   sysdc_reset_updates;    /* # of updates before reset */
  326  326  static uint_t   sysdc_break_updates;    /* # of updates a break lasts */
  327  327  static uint_t   sysdc_nobreak_updates;  /* # of updates after a break w/o pset DC check */
  328  328  static uint_t   sysdc_minDC;            /* minimum allowed DC */
  329  329  static uint_t   sysdc_maxDC;            /* maximum allowed DC */
  330  330  static pri_t    sysdc_minpri;           /* minimum allowed priority */
  331  331  static pri_t    sysdc_maxpri;           /* maximum allowed priority */
 332  332  
 333  333  /*
 334  334   * Internal state
 335  335   */
  336  336  static kmutex_t sysdc_pset_lock;        /* lock protecting pset data */
  337  337  static list_t   sysdc_psets;            /* list of psets with SDC threads */
  338  338  static uint_t   sysdc_param_init;       /* set to 1 once sysdc_initparam() has run */
  339  339  static uint_t   sysdc_update_timeout_started; /* update timeout is active */
  340  340  static hrtime_t sysdc_last_update;      /* time of last sysdc_update() */
  341  341  static sysdc_t  sysdc_dummy;            /* used to terminate active lists */
 342  342  
 343  343  /*
 344  344   * Internal state - active hash table
 345  345   */
  346  346  #define SYSDC_NLISTS    8       /* bucket count; SYSDC_HASH masks with (SYSDC_NLISTS - 1) */
  347  347  #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))  /* pointer >> 6, masked to a bucket index */
  348  348  static sysdc_list_t     sysdc_active[SYSDC_NLISTS];     /* one list per hash bucket */
  349  349  #define SYSDC_LIST(sdc)         (&sysdc_active[SYSDC_HASH(sdc)])        /* address of the bucket sdc hashes to */
 350  350  
  351  351  #ifdef DEBUG
  352  352  static struct {         /* event counters, only present when DEBUG is defined */
  353  353          uint64_t        sysdc_update_times_asleep;
  354  354          uint64_t        sysdc_update_times_base_ran_backwards;
  355  355          uint64_t        sysdc_update_times_already_done;
  356  356          uint64_t        sysdc_update_times_cur_ran_backwards;
  357  357          uint64_t        sysdc_compute_pri_breaking;
  358  358          uint64_t        sysdc_activate_enter;
  359  359          uint64_t        sysdc_update_enter;
  360  360          uint64_t        sysdc_update_exited;
  361  361          uint64_t        sysdc_update_not_sdc;
  362  362          uint64_t        sysdc_update_idle;
  363  363          uint64_t        sysdc_update_take_break;
  364  364          uint64_t        sysdc_update_no_psets;
  365  365          uint64_t        sysdc_tick_not_sdc;
  366  366          uint64_t        sysdc_tick_quantum_expired;
  367  367          uint64_t        sysdc_thread_enter_enter;
  368  368  } sysdc_stats;
  369  369  
  370  370  #define SYSDC_INC_STAT(x)       (sysdc_stats.x++)       /* bump the named counter */
  371  371  #else
  372  372  #define SYSDC_INC_STAT(x)       ((void)0)               /* no-op when DEBUG is not defined */
  373  373  #endif
 374  374  
  375  375  /*
             * macros are UPPER CASE
             *
             * HOWMANY(a, b) forwards to howmany() with both arguments
             * parenthesized.  MSECTOTICKS(a) converts a millisecond count to
             * clock ticks using usec_per_tick.  Both are #undef'd again right
             * after sysdc_initparam().
             */
  376  376  #define HOWMANY(a, b)   howmany((a), (b))
  377  377  #define MSECTOTICKS(a)  HOWMANY((a) * 1000, usec_per_tick)
 378  378  
 379  379  static void
 380  380  sysdc_initparam(void)
 381  381  {
 382  382          uint_t sysdc_break_ticks;
 383  383  
 384  384          /* update / prune intervals */
 385  385          sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
 386  386  
 387  387          sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
 388  388              sysdc_update_interval_msec);
 389  389          sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
 390  390              sysdc_update_interval_msec);
 391  391  
 392  392          /* We must get at least a little time on CPU. */
 393  393          sysdc_minDC = 1;
 394  394          sysdc_maxDC = SYSDC_DC_MAX;
 395  395          sysdc_minpri = 0;
 396  396          sysdc_maxpri = maxclsyspri;
 397  397  
 398  398          /* break parameters */
 399  399          if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
 400  400                  sysdc_max_pset_DC = SYSDC_DC_MAX;
 401  401          }
 402  402          sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
 403  403          sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
 404  404  
 405  405          /*
 406  406           * We want:
 407  407           *
 408  408           *      sysdc_max_pset_DC = (nobreak / (break + nobreak))
 409  409           *
 410  410           *      ==>       nobreak = sysdc_max_pset_DC * (break + nobreak)
 411  411           *
 412  412           *                          sysdc_max_pset_DC * break
 413  413           *      ==>       nobreak = -------------------------
 414  414           *                          1 - sysdc_max_pset_DC
 415  415           */
 416  416          sysdc_nobreak_updates =
 417  417              HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
 418  418              (SYSDC_DC_MAX - sysdc_max_pset_DC));
 419  419  
 420  420          sysdc_param_init = 1;
 421  421  }
 422  422  
  423  423  #undef HOWMANY          /* helper macro for sysdc_initparam() above */
  424  424  #undef MSECTOTICKS      /* helper macro for sysdc_initparam() above */
  425  425  
            /* Bit flags passed as the "flags" argument of sysdc_update_times(). */
  426  426  #define SDC_UPDATE_INITIAL      0x1     /* for the initial update */
  427  427  #define SDC_UPDATE_TIMEOUT      0x2     /* from sysdc_update() */
  428  428  #define SDC_UPDATE_TICK         0x4     /* from sysdc_tick(), on expiry */
 429  429  
 430  430  /*
 431  431   * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 432  432   * and Runnable times since the last reset.
 433  433   *
 434  434   * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 435  435   * to track processor set usage.
 436  436   */
 437  437  static void
 438  438  sysdc_update_times(sysdc_t *sdc, uint_t flags,
 439  439      hrtime_t *O, hrtime_t *R, hrtime_t *newO)
 440  440  {
 441  441          kthread_t *const t = sdc->sdc_thread;
 442  442          const uint_t    initial = (flags & SDC_UPDATE_INITIAL);
 443  443          const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
 444  444          const clock_t   now = ddi_get_lbolt();
 445  445          uint_t          do_reset;
 446  446  
 447  447          ASSERT(THREAD_LOCK_HELD(t));
 448  448  
 449  449          *O = *R = 0;
 450  450  
 451  451          /* If we've been sleeping, we know we haven't had any ONPROC time. */
 452  452          if (sdc->sdc_sleep_updates != 0 &&
 453  453              sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
 454  454                  *newO = sdc->sdc_last_base_O;
 455  455                  SYSDC_INC_STAT(sysdc_update_times_asleep);
 456  456                  return;
 457  457          }
 458  458  
 459  459          /*
 460  460           * If this is our first update, or we've hit the reset point,
 461  461           * we need to reset our base_{O,R}.  Once we've updated them, we
 462  462           * report O and R for the entire prior interval.
 463  463           */
 464  464          do_reset = initial;
 465  465          if (update) {
 466  466                  ++sdc->sdc_nupdates;
 467  467                  if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
 468  468                          do_reset = 1;
 469  469          }
 470  470          if (do_reset) {
 471  471                  hrtime_t baseO, baseR;
 472  472                  if (initial) {
 473  473                          /*
 474  474                           * Start off our cycle count somewhere in the middle,
 475  475                           * to keep the resets from all happening at once.
 476  476                           *
 477  477                           * 4999 is a handy prime much larger than
 478  478                           * sysdc_reset_updates, so that we don't run into
 479  479                           * trouble if the resolution is a multiple of
 480  480                           * sysdc_reset_updates.
 481  481                           */
 482  482                          sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
 483  483                              sysdc_reset_updates);
 484  484                          baseO = baseR = 0;
 485  485                  } else {
 486  486                          baseO = sdc->sdc_base_O;
 487  487                          baseR = sdc->sdc_base_R;
 488  488                  }
 489  489  
 490  490                  mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
 491  491                  *newO = sdc->sdc_base_O;
 492  492  
 493  493                  sdc->sdc_reset = now;
 494  494                  sdc->sdc_pri_check = -1; /* force mismatch below */
 495  495  
 496  496                  /*
 497  497                   * See below for rationale.
 498  498                   */
 499  499                  if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
 500  500                          SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
 501  501                          baseO = sdc->sdc_base_O;
 502  502                          baseR = sdc->sdc_base_R;
 503  503                  }
 504  504  
 505  505                  /* compute based on the entire interval */
 506  506                  *O = (sdc->sdc_base_O - baseO);
 507  507                  *R = (sdc->sdc_base_R - baseR);
 508  508                  return;
 509  509          }
 510  510  
 511  511          /*
 512  512           * If we're called from sysdc_update(), we *must* return a value
 513  513           * for newO, so we always call mstate_systhread_times().
 514  514           *
 515  515           * Otherwise, if we've already done a pri check this tick,
 516  516           * we can skip it.
 517  517           */
 518  518          if (!update && sdc->sdc_pri_check == now) {
 519  519                  SYSDC_INC_STAT(sysdc_update_times_already_done);
 520  520                  return;
 521  521          }
 522  522  
 523  523          /* Get the current times from the thread */
 524  524          sdc->sdc_pri_check = now;
 525  525          mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
 526  526          *newO = sdc->sdc_cur_O;
 527  527  
 528  528          /*
 529  529           * The updating of microstate accounting is not done under a
 530  530           * consistent set of locks, particularly the t_waitrq field.  This
 531  531           * can lead to narrow windows in which we account for time in the
 532  532           * wrong bucket, which on the next read will be accounted for
 533  533           * correctly.
 534  534           *
 535  535           * If our sdc_base_* fields were affected by one of these blips, we
 536  536           * throw away the old data, and pretend this tick didn't happen.
 537  537           */
 538  538          if (sdc->sdc_cur_O < sdc->sdc_base_O ||
 539  539              sdc->sdc_cur_R < sdc->sdc_base_R) {
 540  540  
 541  541                  sdc->sdc_base_O = sdc->sdc_cur_O;
 542  542                  sdc->sdc_base_R = sdc->sdc_cur_R;
 543  543  
 544  544                  SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
 545  545                  return;
 546  546          }
 547  547  
 548  548          *O = sdc->sdc_cur_O - sdc->sdc_base_O;
 549  549          *R = sdc->sdc_cur_R - sdc->sdc_base_R;
 550  550  }
 551  551  
 552  552  /*
 553  553   * sysdc_compute_pri()
 554  554   *
 555  555   *      Recomputes the priority of the thread, leaving the result in
 556  556   *      sdc->sdc_epri.  Returns 1 if a priority update should occur
 557  557   *      (which will also trigger a cpu_surrender()), otherwise
 558  558   *      returns 0.
 559  559   */
 560  560  static uint_t
 561  561  sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
 562  562  {
 563  563          kthread_t *const t = sdc->sdc_thread;
 564  564          const uint_t    update = (flags & SDC_UPDATE_TIMEOUT);
 565  565          const uint_t    tick = (flags & SDC_UPDATE_TICK);
 566  566  
 567  567          hrtime_t        O, R;
 568  568          hrtime_t        newO = -1;
 569  569  
 570  570          ASSERT(THREAD_LOCK_HELD(t));
 571  571  
 572  572          sysdc_update_times(sdc, flags, &O, &R, &newO);
 573  573          ASSERT(!update || newO != -1);
 574  574  
 575  575          /* If we have new data, recompute our priority. */
 576  576          if ((O + R) != 0) {
 577  577                  sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
 578  578  
 579  579                  /* Adjust our priority to move our DC closer to the target. */
 580  580                  if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
 581  581                          sdc->sdc_pri = sdc->sdc_maxpri;
 582  582                  else
 583  583                          sdc->sdc_pri = sdc->sdc_minpri;
 584  584          }
 585  585  
 586  586          /*
 587  587           * If our per-pset duty cycle goes over the max, we will take a break.
 588  588           * This forces all sysdc threads in the pset to minimum priority, in
 589  589           * order to let everyone else have a chance at the CPU.
 590  590           */
 591  591          if (sdc->sdc_pset->sdp_need_break) {
 592  592                  SYSDC_INC_STAT(sysdc_compute_pri_breaking);
 593  593                  sdc->sdc_epri = sdc->sdc_minpri;
 594  594          } else {
 595  595                  sdc->sdc_epri = sdc->sdc_pri;
 596  596          }
 597  597  
 598  598          DTRACE_PROBE4(sysdc__compute__pri,
 599  599              kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
 600  600              uint_t, sdc->sdc_target_DC);
 601  601  
 602  602          /*
 603  603           * For sysdc_update(), we compute the ONPROC time for high-priority
 604  604           * threads, which is used to calculate the per-pset duty cycle.  We
 605  605           * will always tell our callers to update the thread's priority,
 606  606           * since we want to force a cpu_surrender().
 607  607           *
 608  608           * We reset sdc_update_ticks so that sysdc_tick() will only update
 609  609           * the thread's priority if our timeout is delayed by a tick or
 610  610           * more.
 611  611           */
 612  612          if (update) {
 613  613                  /* SDC threads are not allowed to change cpupart bindings. */
 614  614                  ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
 615  615  
 616  616                  /* If we were at MAXPRI, account for our onproc time. */
 617  617                  if (t->t_pri == sdc->sdc_maxpri &&
 618  618                      sdc->sdc_last_base_O != 0 &&
 619  619                      sdc->sdc_last_base_O < newO) {
 620  620                          sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
 621  621                          sdc->sdc_pset->sdp_onproc_time +=
 622  622                              (uint64_t)sdc->sdc_last_O;
 623  623                          sdc->sdc_pset->sdp_onproc_threads++;
 624  624                  } else {
 625  625                          sdc->sdc_last_O = 0;
 626  626                  }
 627  627                  sdc->sdc_last_base_O = newO;
 628  628  
 629  629                  sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
 630  630                  return (1);
 631  631          }
 632  632  
 633  633          /*
 634  634           * Like sysdc_update(), sysdc_tick() always wants to update the
 635  635           * thread's priority, so that the CPU is surrendered if necessary.
 636  636           * We reset sdc_update_ticks so that if the timeout continues to be
 637  637           * delayed, we'll update at the regular interval.
 638  638           */
 639  639          if (tick) {
 640  640                  ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
 641  641                  sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
 642  642                  return (1);
 643  643          }
 644  644  
 645  645          /*
 646  646           * Otherwise, only tell our callers to update the priority if it has
 647  647           * changed.
 648  648           */
 649  649          return (sdc->sdc_epri != t->t_pri);
 650  650  }
 651  651  
 652  652  static void
 653  653  sysdc_update_pri(sysdc_t *sdc, uint_t flags)
 654  654  {
                   /*
                    * Recompute the priority for sdc's thread and apply it.
                    *
                    * sysdc_compute_pri() refreshes sdc->sdc_epri and returns non-zero
                    * when the thread's priority should be (re)applied; 'flags' is
                    * passed through to it unchanged.  The caller must already hold
                    * the thread lock for sdc->sdc_thread.
                    */
 655  655          kthread_t *t = sdc->sdc_thread;
 656  656  
 657  657          ASSERT(THREAD_LOCK_HELD(t));
 658  658  
 659  659          if (sysdc_compute_pri(sdc, flags)) {
                           /*
                            * When thread_change_pri() returns zero we call
                            * cpu_surrender() ourselves.
                            * NOTE(review): presumably a zero return means the
                            * priority change did not already requeue the thread;
                            * confirm against thread_change_pri().
                            */
 660  660                  if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
 661  661                          cpu_surrender(t);
 662  662                  }
 663  663          }
 664  664  }
 665  665  
 666  666  /*
 667  667   * Add a thread onto the active list.  It will only be removed by
 668  668   * sysdc_update().
            *
            * The sdc must not currently be on a list (sdc_next == NULL) and the
            * caller must hold the thread lock; both are asserted.  The list is
            * selected with SYSDC_LIST(sdc) and is updated here without taking
            * sdl_lock.
 669  669   */
 670  670  static void
 671  671  sysdc_activate(sysdc_t *sdc)
 672  672  {
 673  673          sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
 674  674          sysdc_t         *head;
 675  675          kthread_t       *t = sdc->sdc_thread;
 676  676  
 677  677          SYSDC_INC_STAT(sysdc_activate_enter);
 678  678  
 679  679          ASSERT(sdc->sdc_next == NULL);
 680  680          ASSERT(THREAD_LOCK_HELD(t));
 681  681  
                   /*
                    * Push sdc onto the front of the list: snapshot the current head,
                    * link sdc to it, and retry the compare-and-swap until the head
                    * we linked to is still the head at the moment of the swap.
                    */
 682  682          do {
 683  683                  head = *headp;
 684  684                  sdc->sdc_next = head;
 685  685          } while (atomic_cas_ptr(headp, head, sdc) != head);
 686  686  }
 687  687  
 688  688  /*
 689  689   * sysdc_update() has two jobs:
 690  690   *
 691  691   *      1. It updates the priorities of all active SDC threads on the system.
 692  692   *      2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
            *
            * It is scheduled with timeout() (first by sysdc_enterclass(), then by
            * re-arming itself at the bottom of this function) and stops re-arming
            * once sysdc_psets is empty.  'arg' is not interpreted here; it is only
            * handed to the next timeout() call.
 693  693   */
 694  694  static void
 695  695  sysdc_update(void *arg)
 696  696  {
 697  697          int             idx;
 698  698          sysdc_t         *freelist = NULL;
 699  699          sysdc_pset_t    *cur;
 700  700          hrtime_t        now, diff;
 701  701          uint_t          redeploy = 1;
 702  702  
 703  703          SYSDC_INC_STAT(sysdc_update_enter);
 704  704  
 705  705          ASSERT(sysdc_update_timeout_started);
 706  706  
 707  707          /*
 708  708           * If this is our first time through, diff will be gigantic, and
 709  709           * no breaks will be necessary.
 710  710           */
 711  711          now = gethrtime();
 712  712          diff = now - sysdc_last_update;
 713  713          sysdc_last_update = now;
 714  714  
                   /*
                    * Pass 1 over the psets: clear the per-interval ONPROC accounting
                    * and latch sdp_need_break from sdp_should_break, so the per-thread
                    * pass below sees a fixed "breaking" state for this interval.
                    */
 715  715          mutex_enter(&sysdc_pset_lock);
 716  716          for (cur = list_head(&sysdc_psets); cur != NULL;
 717  717              cur = list_next(&sysdc_psets, cur)) {
 718  718                  boolean_t breaking = (cur->sdp_should_break != 0);
 719  719  
 720  720                  if (cur->sdp_need_break != breaking) {
 721  721                          DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
 722  722                              boolean_t, breaking);
 723  723                  }
 724  724                  cur->sdp_onproc_time = 0;
 725  725                  cur->sdp_onproc_threads = 0;
 726  726                  cur->sdp_need_break = breaking;
 727  727          }
 728  728          mutex_exit(&sysdc_pset_lock);
 729  729  
                   /* Pass 2: walk every active-list bucket and update its threads. */
 730  730          for (idx = 0; idx < SYSDC_NLISTS; idx++) {
 731  731                  sysdc_list_t            *sdl = &sysdc_active[idx];
 732  732                  sysdc_t *volatile       *headp = &sdl->sdl_list;
 733  733                  sysdc_t                 *head, *tail;
 734  734                  sysdc_t                 **prevptr;
 735  735  
                           /* &sysdc_dummy is the list terminator: bucket is empty. */
 736  736                  if (*headp == &sysdc_dummy)
 737  737                          continue;
 738  738  
 739  739                  /* Prevent any threads from exiting while we're poking them. */
 740  740                  mutex_enter(&sdl->sdl_lock);
 741  741  
 742  742                  /*
 743  743                   * Each sdl_list contains a singly-linked list of active
 744  744                   * threads. Threads which become active while we are
 745  745                   * processing the list will be added to sdl_list.  Since we
 746  746                   * don't want that to interfere with our own processing, we
 747  747                   * swap in an empty list.  Any newly active threads will
 748  748                   * go on to this empty list.  When finished, we'll put any
 749  749                   * such threads at the end of the processed list.
 750  750                   */
 751  751                  head = atomic_swap_ptr(headp, &sysdc_dummy);
                           /*
                            * prevptr always points at the link that refers to the
                            * entry being examined, so an entry can be unlinked by
                            * storing through it.
                            */
 752  752                  prevptr = &head;
 753  753                  while (*prevptr != &sysdc_dummy) {
 754  754                          sysdc_t         *const  sdc = *prevptr;
 755  755                          kthread_t       *const  t = sdc->sdc_thread;
 756  756  
 757  757                          /*
 758  758                           * If the thread has exited, move its sysdc_t onto
 759  759                           * freelist, to be freed later.
 760  760                           */
 761  761                          if (t == NULL) {
 762  762                                  *prevptr = sdc->sdc_next;
 763  763                                  SYSDC_INC_STAT(sysdc_update_exited);
 764  764                                  sdc->sdc_next = freelist;
 765  765                                  freelist = sdc;
 766  766                                  continue;
 767  767                          }
 768  768  
 769  769                          thread_lock(t);
                                   /*
                                    * Thread is not (or no longer) in the SDC class:
                                    * leave it linked and move on without touching it.
                                    */
 770  770                          if (t->t_cid != sysdccid) {
 771  771                                  thread_unlock(t);
 772  772                                  prevptr = &sdc->sdc_next;
 773  773                                  SYSDC_INC_STAT(sysdc_update_not_sdc);
 774  774                                  continue;
 775  775                          }
 776  776                          ASSERT(t->t_cldata == sdc);
 777  777  
 778  778                          /*
 779  779                           * If the thread has been sleeping for longer
 780  780                           * than sysdc_prune_interval, make it inactive by
 781  781                           * removing it from the list.
                                    *
                                    * NOTE(review): sysdc_sleep() stores sdc_nupdates
                                    * into sdc_sleep_updates, so if sdc_nupdates only
                                    * grows afterwards the difference below is
                                    * "sleep - now" rather than "now - sleep".  Confirm
                                    * the operand order (and the signedness of these
                                    * fields) against the intent stated above.
 782  782                           */
 783  783                          if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
 784  784                              sdc->sdc_sleep_updates != 0 &&
 785  785                              (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
 786  786                              sysdc_prune_updates) {
 787  787                                  *prevptr = sdc->sdc_next;
 788  788                                  SYSDC_INC_STAT(sysdc_update_idle);
                                           /* NULL sdc_next marks "not on any list". */
 789  789                                  sdc->sdc_next = NULL;
 790  790                                  thread_unlock(t);
 791  791                                  continue;
 792  792                          }
 793  793                          sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
 794  794                          thread_unlock(t);
 795  795  
 796  796                          prevptr = &sdc->sdc_next;
 797  797                  }
 798  798  
 799  799                  /*
 800  800                   * Add our list to the bucket, putting any new entries
 801  801                   * added while we were working at the tail of the list.
                            *
                            * *prevptr is the terminating link of the processed list;
                            * it is pointed at whatever is in the bucket now, and the
                            * swap is retried until the bucket did not change in
                            * between.
 802  802                   */
 803  803                  do {
 804  804                          tail = *headp;
 805  805                          *prevptr = tail;
 806  806                  } while (atomic_cas_ptr(headp, tail, head) != tail);
 807  807  
 808  808                  mutex_exit(&sdl->sdl_lock);
 809  809          }
 810  810  
                   /*
                    * Pass 3 over the psets: turn the ONPROC time gathered above into
                    * a duty cycle for the interval and decide whether to start a
                    * break.
                    */
 811  811          mutex_enter(&sysdc_pset_lock);
 812  812          for (cur = list_head(&sysdc_psets); cur != NULL;
 813  813              cur = list_next(&sysdc_psets, cur)) {
 814  814  
                           /*
                            * NOTE(review): the divisor below is diff * cp_ncpus;
                            * this assumes neither factor is zero -- confirm.
                            */
 815  815                  cur->sdp_vtime_last_interval =
 816  816                      diff * cur->sdp_cpupart->cp_ncpus;
 817  817                  cur->sdp_DC_last_interval =
 818  818                      (cur->sdp_onproc_time * SYSDC_DC_MAX) /
 819  819                      cur->sdp_vtime_last_interval;
 820  820  
 821  821                  if (cur->sdp_should_break > 0) {
 822  822                          cur->sdp_should_break--;        /* breaking */
 823  823                          continue;
 824  824                  }
 825  825                  if (cur->sdp_dont_break > 0) {
 826  826                          cur->sdp_dont_break--;  /* waiting before checking */
 827  827                          continue;
 828  828                  }
                           /* Over the limit: arm both the break and no-break counters. */
 829  829                  if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
 830  830                          cur->sdp_should_break = sysdc_break_updates;
 831  831                          cur->sdp_dont_break = sysdc_nobreak_updates;
 832  832                          SYSDC_INC_STAT(sysdc_update_take_break);
 833  833                  }
 834  834          }
 835  835  
 836  836          /*
 837  837           * If there are no sysdc_psets, there can be no threads, so
 838  838           * we can stop doing our timeout.  Since we're holding the
 839  839           * sysdc_pset_lock, no new sysdc_psets can come in, which will
 840  840           * prevent anyone from racing with this and dropping our timeout
 841  841           * on the floor.
 842  842           */
 843  843          if (list_is_empty(&sysdc_psets)) {
 844  844                  SYSDC_INC_STAT(sysdc_update_no_psets);
 845  845                  ASSERT(sysdc_update_timeout_started);
 846  846                  sysdc_update_timeout_started = 0;
 847  847  
 848  848                  redeploy = 0;
 849  849          }
 850  850          mutex_exit(&sysdc_pset_lock);
 851  851  
                   /*
                    * Free the sysdc_t's of exited threads collected in pass 2, now
                    * that no locks are held.  Note that this 'cur' shadows the
                    * sysdc_pset_t 'cur' declared at the top of the function.
                    */
 852  852          while (freelist != NULL) {
 853  853                  sysdc_t *cur = freelist;
 854  854                  freelist = cur->sdc_next;
 855  855                  kmem_free(cur, sizeof (*cur));
 856  856          }
 857  857  
 858  858          if (redeploy) {
 859  859                  (void) timeout(sysdc_update, arg, sysdc_update_ticks);
 860  860          }
 861  861  }
 862  862  
 863  863  static void
 864  864  sysdc_preempt(kthread_t *t)
 865  865  {
                   /*
                    * Preemption hook from sysdc_classfuncs.  Must be called on the
                    * current thread with its thread lock held (both asserted); the
                    * thread is simply placed at the back of its dispatch queue.
                    */
 866  866          ASSERT(t == curthread);
 867  867          ASSERT(THREAD_LOCK_HELD(t));
 868  868  
 869  869          setbackdq(t);           /* give others a chance to run */
 870  870  }
 871  871  
 872  872  static void
 873  873  sysdc_tick(kthread_t *t)
 874  874  {
                   /*
                    * Tick hook from sysdc_classfuncs.  Takes the thread lock itself,
                    * counts the tick in sdc_ticks, and forces a priority update via
                    * sysdc_update_pri(SDC_UPDATE_TICK) when the count reaches
                    * sdc_update_ticks.
                    */
 875  875          sysdc_t *sdc;
 876  876  
 877  877          thread_lock(t);
                   /* The thread may no longer be in the SDC class; nothing to do then. */
 878  878          if (t->t_cid != sysdccid) {
 879  879                  SYSDC_INC_STAT(sysdc_tick_not_sdc);
 880  880                  thread_unlock(t);
 881  881                  return;
 882  882          }
 883  883          sdc = t->t_cldata;
                   /*
                    * If we are on CPU but our priority is below the highest runnable
                    * priority on our dispatch queue, give up the CPU.
                    */
 884  884          if (t->t_state == TS_ONPROC &&
 885  885              t->t_pri < t->t_disp_queue->disp_maxrunpri) {
 886  886                  cpu_surrender(t);
 887  887          }
 888  888  
                   /* sysdc_setrun() clears sdc_sleep_updates before a thread can run. */
 889  889          if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
 890  890                  ASSERT(sdc->sdc_sleep_updates == 0);
 891  891          }
 892  892  
                   /*
                    * sdc_update_ticks is always set ahead of sdc_ticks, so the two
                    * must differ both before this tick is counted and again after
                    * sysdc_update_pri() has reset the target.
                    */
 893  893          ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
 894  894          sdc->sdc_ticks++;
 895  895          if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
 896  896                  SYSDC_INC_STAT(sysdc_tick_quantum_expired);
 897  897                  sysdc_update_pri(sdc, SDC_UPDATE_TICK);
 898  898                  ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
 899  899          }
 900  900          thread_unlock(t);
 901  901  }
 902  902  
 903  903  static void
 904  904  sysdc_setrun(kthread_t *t)
 905  905  {
                   /*
                    * Setrun hook from sysdc_classfuncs (also used by sysdc_wakeup()).
                    * Clears the "asleep since" marker, puts the thread back on the
                    * active list if sysdc_update() had pruned it, and queues the
                    * thread at the back of its dispatch queue.
                    */
 906  906          sysdc_t *sdc = t->t_cldata;
 907  907  
 908  908          ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */
 909  909  
 910  910          sdc->sdc_sleep_updates = 0;
 911  911  
                   /* sdc_next == NULL means the sdc is not on an active list. */
 912  912          if (sdc->sdc_next == NULL) {
 913  913                  /*
 914  914                   * Since we're in transition, we don't want to use the
 915  915                   * full thread_update_pri().
                            *
                            * NOTE(review): no thread_update_pri() is visible in this
                            * file; this presumably refers to sysdc_update_pri(), which
                            * goes through thread_change_pri()/cpu_surrender().  Here
                            * the priority is computed and stored directly instead.
 916  916                   */
 917  917                  if (sysdc_compute_pri(sdc, 0)) {
 918  918                          THREAD_CHANGE_PRI(t, sdc->sdc_epri);
 919  919                  }
 920  920                  sysdc_activate(sdc);
 921  921  
                           /* Lists end in &sysdc_dummy, so a linked sdc_next is non-NULL. */
 922  922                  ASSERT(sdc->sdc_next != NULL);
 923  923          }
 924  924  
 925  925          setbackdq(t);
 926  926  }
 927  927  
 928  928  static void
 929  929  sysdc_wakeup(kthread_t *t)
 930  930  {
                   /* Wakeup hook: handled exactly like setrun. */
 931  931          sysdc_setrun(t);
 932  932  }
 933  933  
 934  934  static void
 935  935  sysdc_sleep(kthread_t *t)
 936  936  {
                   /*
                    * Sleep hook: remember the update count at which the thread went
                    * to sleep.  sysdc_update() consults sdc_sleep_updates when
                    * deciding whether to prune the thread from the active list, and
                    * sysdc_setrun() resets it to 0.
                    */
 937  937          sysdc_t *sdc = t->t_cldata;
 938  938  
 939  939          ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */
 940  940  
 941  941          sdc->sdc_sleep_updates = sdc->sdc_nupdates;
 942  942  }
 943  943  
 944  944  /*ARGSUSED*/
 945  945  static int
 946  946  sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
 947  947      void *bufp)
 948  948  {
                   /*
                    * Move thread t into the SDC class.
                    *
                    *   parmsp  sysdc_params_t with the min/max priority and target
                    *           duty cycle (range-checked by the ASSERTs below)
                    *   bufp    sysdc_t from sysdc_alloc(), carrying a spare
                    *           sysdc_pset_t in sdc_pset
                    *
                    * Returns EPERM if t is not currently in the SYS class, 0 otherwise.
                    * On the EPERM path bufp is left untouched.
                    */
 949  949          cpupart_t *const cpupart = t->t_cpupart;
 950  950          sysdc_t *sdc = bufp;
 951  951          sysdc_params_t *sdpp = parmsp;
 952  952          sysdc_pset_t *newpset = sdc->sdc_pset;
 953  953          sysdc_pset_t *pset;
 954  954          int start_timeout;
 955  955  
 956  956          if (t->t_cid != syscid)
 957  957                  return (EPERM);
 958  958  
 959  959          ASSERT(ttolwp(t) != NULL);
 960  960          ASSERT(sdpp != NULL);
 961  961          ASSERT(newpset != NULL);
 962  962          ASSERT(sysdc_param_init);
 963  963  
 964  964          ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
 965  965          ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
 966  966          ASSERT(sdpp->sdp_DC >= sysdc_minDC);
 967  967          ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
 968  968  
 969  969          sdc->sdc_thread = t;
 970  970          sdc->sdc_pri = sdpp->sdp_maxpri;        /* start off maximally */
 971  971          sdc->sdc_minpri = sdpp->sdp_minpri;
 972  972          sdc->sdc_maxpri = sdpp->sdp_maxpri;
 973  973          sdc->sdc_target_DC = sdpp->sdp_DC;
 974  974          sdc->sdc_ticks = 0;
 975  975          sdc->sdc_update_ticks = sysdc_update_ticks + 1;
 976  976  
 977  977          /* Assign ourselves to the appropriate pset. */
 978  978          sdc->sdc_pset = NULL;
 979  979          mutex_enter(&sysdc_pset_lock);
 980  980          for (pset = list_head(&sysdc_psets); pset != NULL;
 981  981              pset = list_next(&sysdc_psets, pset)) {
 982  982                  if (pset->sdp_cpupart == cpupart) {
 983  983                          break;
 984  984                  }
 985  985          }
                   /*
                    * No sysdc_pset_t exists yet for this cpupart: adopt the spare one
                    * and clear newpset so that it is not freed below.
                    */
 986  986          if (pset == NULL) {
 987  987                  pset = newpset;
 988  988                  newpset = NULL;
 989  989                  pset->sdp_cpupart = cpupart;
 990  990                  list_insert_tail(&sysdc_psets, pset);
 991  991          }
 992  992          pset->sdp_nthreads++;
 993  993          ASSERT(pset->sdp_nthreads > 0);
 994  994  
 995  995          sdc->sdc_pset = pset;
 996  996  
                   /*
                    * Decide under sysdc_pset_lock whether we are the one who must
                    * start the sysdc_update() timeout; the flag is cleared again by
                    * sysdc_update() under the same lock.
                    */
 997  997          start_timeout = (sysdc_update_timeout_started == 0);
 998  998          sysdc_update_timeout_started = 1;
 999  999          mutex_exit(&sysdc_pset_lock);
1000 1000  
                   /* An existing pset was found, so the spare one is not needed. */
1001 1001          if (newpset != NULL)
1002 1002                  kmem_free(newpset, sizeof (*newpset));
1003 1003  
1004 1004          /* Update t's scheduling class and priority. */
1005 1005          thread_lock(t);
1006 1006          t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1007 1007          t->t_cid = cid;
1008 1008          t->t_cldata = sdc;
1009 1009          t->t_schedflag |= TS_RUNQMATCH;
1010 1010  
1011 1011          sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1012 1012          thread_unlock(t);
1013 1013  
1014 1014          /* Kick off the thread timeout if we're the first one in. */
1015 1015          if (start_timeout) {
1016 1016                  (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1017 1017          }
1018 1018  
1019 1019          return (0);
1020 1020  }
1021 1021  
1022 1022  static void
1023 1023  sysdc_leave(sysdc_t *sdc)
1024 1024  {
                   /*
                    * Detach sdc from its thread and from its pset, freeing whatever
                    * is no longer referenced.  Used by sysdc_exitclass() and
                    * sysdc_exit().
                    */
1025 1025          sysdc_pset_t *sdp = sdc->sdc_pset;
1026 1026          sysdc_list_t *sdl = SYSDC_LIST(sdc);
1027 1027          uint_t freedc;
1028 1028  
                   /*
                    * Clearing sdc_thread tells sysdc_update() the thread is gone.
                    * If the sdc is still on an active list (sdc_next != NULL) it is
                    * left for sysdc_update() to unlink and free; otherwise we free it
                    * ourselves below.
                    */
1029 1029          mutex_enter(&sdl->sdl_lock);            /* block sysdc_update() */
1030 1030          sdc->sdc_thread = NULL;
1031 1031          freedc = (sdc->sdc_next == NULL);
1032 1032          mutex_exit(&sdl->sdl_lock);
1033 1033  
1034 1034          mutex_enter(&sysdc_pset_lock);
1035 1035          ASSERT(sdp != NULL);
1036 1036          ASSERT(sdp->sdp_nthreads > 0);
1037 1037          --sdp->sdp_nthreads;
                   /*
                    * The last thread leaving a pset removes it from sysdc_psets; sdp
                    * is kept non-NULL only in that case so it can be freed once the
                    * lock has been dropped.
                    */
1038 1038          if (sdp->sdp_nthreads == 0) {
1039 1039                  list_remove(&sysdc_psets, sdp);
1040 1040          } else {
1041 1041                  sdp = NULL;
1042 1042          }
1043 1043          mutex_exit(&sysdc_pset_lock);
1044 1044  
1045 1045          if (freedc)
1046 1046                  kmem_free(sdc, sizeof (*sdc));
1047 1047          if (sdp != NULL)
1048 1048                  kmem_free(sdp, sizeof (*sdp));
1049 1049  }
1050 1050  
1051 1051  static void
1052 1052  sysdc_exitclass(void *buf)
1053 1053  {
                   /* Exit-class hook: buf is the thread's sysdc_t; release it. */
1054 1054          sysdc_leave((sysdc_t *)buf);
1055 1055  }
1056 1056  
1057 1057  /*ARGSUSED*/
1058 1058  static int
1059 1059  sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1060 1060  {
                   /* Can-exit hook: always refuses with EPERM; both arguments are unused. */
1061 1061          /* Threads cannot exit SDC once joined, except in a body bag. */
1062 1062          return (EPERM);
1063 1063  }
1064 1064  
1065 1065  static void
1066 1066  sysdc_exit(kthread_t *t)
1067 1067  {
                   /*
                    * Exit hook: switch the exiting thread back to the SYS class under
                    * the thread lock, then release its SDC state with sysdc_leave().
                    */
1068 1068          sysdc_t *sdc;
1069 1069  
1070 1070          /* We're exiting, so we just rejoin the SYS class. */
1071 1071          thread_lock(t);
1072 1072          ASSERT(t->t_cid == sysdccid);
                   /* Save the sdc before the class-data pointer is cleared. */
1073 1073          sdc = t->t_cldata;
1074 1074          t->t_cid = syscid;
1075 1075          t->t_cldata = NULL;
1076 1076          t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1077 1077          (void) thread_change_pri(t, maxclsyspri, 0);
                   /* Undo the TS_RUNQMATCH set by sysdc_enterclass(). */
1078 1078          t->t_schedflag &= ~TS_RUNQMATCH;
1079 1079          thread_unlock_nopreempt(t);
1080 1080  
1081 1081          /* Unlink the sdc from everything. */
1082 1082          sysdc_leave(sdc);
1083 1083  }
1084 1084  
1085 1085  /*ARGSUSED*/
1086 1086  static int
1087 1087  sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1088 1088  {
1089 1089          /*
1090 1090           * Threads cannot be created with SDC as their class; they must
1091 1091           * be created as SYS and then added with sysdc_thread_enter().
1092 1092           * Because of this restriction, sysdc_fork() should never be called.
1093 1093           */
1094 1094          panic("sysdc cannot be forked");
1095 1095  
                   /* Follows the panic() call; supplies the int return value. */
1096 1096          return (ENOSYS);
1097 1097  }
1098 1098  
1099 1099  /*ARGSUSED*/
1100 1100  static void
1101 1101  sysdc_forkret(kthread_t *t, kthread_t *ct)
1102 1102  {
                   /* Fork-return hook: like sysdc_fork(), this only panics. */
1103 1103          /* SDC threads are part of system processes, which never fork. */
1104 1104          panic("sysdc cannot be forked");
1105 1105  }
1106 1106  
1107 1107  static pri_t
1108 1108  sysdc_globpri(kthread_t *t)
1109 1109  {
                   /* Global-priority hook: report the thread's t_epri unchanged. */
1110 1110          return (t->t_epri);
1111 1111  }
1112 1112  
1113 1113  /*ARGSUSED*/
1114 1114  static pri_t
1115 1115  sysdc_no_swap(kthread_t *t, int flags)
1116 1116  {
                   /*
                    * Shared by the swapin and swapout slots of sysdc_classfuncs;
                    * always returns -1 and ignores both arguments.
                    */
1117 1117          /* SDC threads cannot be swapped. */
1118 1118          return (-1);
1119 1119  }
1120 1120  
1121 1121  /*
1122 1122   * Get maximum and minimum priorities enjoyed by SDC threads.
            *
            * Stores sysdc_maxpri and sysdc_minpri into pcprip->pc_clpmax and
            * pcprip->pc_clpmin respectively, and always returns 0.
1123 1123   */
1124 1124  static int
1125 1125  sysdc_getclpri(pcpri_t *pcprip)
1126 1126  {
1127 1127          pcprip->pc_clpmax = sysdc_maxpri;
1128 1128          pcprip->pc_clpmin = sysdc_minpri;
1129 1129          return (0);
1130 1130  }
1131 1131  
1132 1132  /*ARGSUSED*/
1133 1133  static int
1134 1134  sysdc_getclinfo(void *arg)
1135 1135  {
                   /* Class-info hook: 'arg' is left untouched and 0 is returned. */
1136 1136          return (0);             /* no class-specific info */
1137 1137  }
1138 1138  
1139 1139  /*ARGSUSED*/
1140 1140  static int
1141 1141  sysdc_alloc(void **p, int flag)
1142 1142  {
                   /*
                    * Allocate the per-thread class data: a zeroed sysdc_t together
                    * with a zeroed spare sysdc_pset_t hung off sdc_pset (consumed or
                    * freed by sysdc_enterclass()).  'flag' is passed to kmem_zalloc().
                    *
                    * Returns 0 with *p set to the new sysdc_t, or ENOMEM with *p set
                    * to NULL and nothing left allocated.
                    */
1143 1143          sysdc_t *new;
1144 1144  
1145 1145          *p = NULL;
1146 1146          if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1147 1147                  return (ENOMEM);
1148 1148          }
1149 1149          if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1150 1150              NULL) {
                           /* Second allocation failed: undo the first one. */
1151 1151                  kmem_free(new, sizeof (*new));
1152 1152                  return (ENOMEM);
1153 1153          }
1154 1154          *p = new;
1155 1155          return (0);
1156 1156  }
1157 1157  
1158 1158  static void
1159 1159  sysdc_free(void *p)
1160 1160  {
                   /*
                    * Release a sysdc_t obtained from sysdc_alloc() that never made it
                    * through sysdc_enterclass(), together with its spare pset.  A
                    * NULL p is accepted and ignored.
                    */
1161 1161          sysdc_t *sdc = p;
1162 1162  
1163 1163          if (sdc != NULL) {
1164 1164                  /*
1165 1165                   * We must have failed CL_ENTERCLASS(), so our pset should be
1166 1166                   * there and unused.
1167 1167                   */
1168 1168                  ASSERT(sdc->sdc_pset != NULL);
1169 1169                  ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1170 1170                  kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1171 1171                  kmem_free(sdc, sizeof (*sdc));
1172 1172          }
1173 1173  }
1174 1174  
1175 1175  static int sysdc_enosys();      /* Boy, ANSI-C's K&R compatibility is weird. */
           /*
            * These stubs are declared with empty parentheses, i.e. without a
            * prototype, so that one function can fill several differently-typed
            * slots of sysdc_classfuncs below.
            */
1176 1176  static int sysdc_einval();
1177 1177  static void sysdc_nullsys();
1178 1178  
1179 1179  static struct classfuncs sysdc_classfuncs = {
                   /*
                    * Operations table handed back to the caller of sysdc_init().
                    * Initializers are positional; unsupported operations use the
                    * sysdc_enosys/sysdc_einval/sysdc_nullsys stubs, and the trailing
                    * comments name the slot each stub occupies.
                    */
1180 1180          /* messages to class manager */
1181 1181          {
1182 1182                  sysdc_enosys,   /* admin */
1183 1183                  sysdc_getclinfo,
1184 1184                  sysdc_enosys,   /* parmsin */
1185 1185                  sysdc_enosys,   /* parmsout */
1186 1186                  sysdc_enosys,   /* vaparmsin */
1187 1187                  sysdc_enosys,   /* vaparmsout */
1188 1188                  sysdc_getclpri,
1189 1189                  sysdc_alloc,
1190 1190                  sysdc_free,
1191 1191          },
1192 1192          /* operations on threads */
1193 1193          {
1194 1194                  sysdc_enterclass,
1195 1195                  sysdc_exitclass,
1196 1196                  sysdc_canexit,
1197 1197                  sysdc_fork,
1198 1198                  sysdc_forkret,
1199 1199                  sysdc_nullsys,  /* parmsget */
1200 1200                  sysdc_enosys,   /* parmsset */
1201 1201                  sysdc_nullsys,  /* stop */
1202 1202                  sysdc_exit,
1203 1203                  sysdc_nullsys,  /* active */
1204 1204                  sysdc_nullsys,  /* inactive */
1205 1205                  sysdc_no_swap,  /* swapin */
1206 1206                  sysdc_no_swap,  /* swapout */
1207 1207                  sysdc_nullsys,  /* trapret */
1208 1208                  sysdc_preempt,
1209 1209                  sysdc_setrun,
1210 1210                  sysdc_sleep,
1211 1211                  sysdc_tick,
1212 1212                  sysdc_wakeup,
1213 1213                  sysdc_einval,   /* donice */
1214 1214                  sysdc_globpri,
1215 1215                  sysdc_nullsys,  /* set_process_group */
1216 1216                  sysdc_nullsys,  /* yield */
1217 1217                  sysdc_einval,   /* doprio */
1218 1218          }
1219 1219  };
1220 1220  
1221 1221  static int
1222 1222  sysdc_enosys()
1223 1223  {
                   /* Stub for unsupported operations: always returns ENOSYS. */
1224 1224          return (ENOSYS);
1225 1225  }
1226 1226  
1227 1227  static int
1228 1228  sysdc_einval()
1229 1229  {
                   /* Stub for the donice/doprio slots: always returns EINVAL. */
1230 1230          return (EINVAL);
1231 1231  }
1232 1232  
1233 1233  static void
1234 1234  sysdc_nullsys()
1235 1235  {
                   /* Stub for operations that need no work: intentionally empty. */
1236 1236  }
1237 1237  
1238 1238  /*ARGSUSED*/
1239 1239  static pri_t
1240 1240  sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1241 1241  {
                   /*
                    * Class initialization routine referenced from 'csw'.  Sets up the
                    * pset list and the active-list buckets, initializes the tunables
                    * via sysdc_initparam(), records our class id, and returns the
                    * operations table through clfuncspp.  'clparmsz' is unused.
                    *
                    * Returns v.v_maxsyspri cast to pri_t.
                    */
1242 1242          int idx;
1243 1243  
1244 1244          list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1245 1245              offsetof(sysdc_pset_t, sdp_node));
1246 1246  
                   /* Every bucket starts out empty, i.e. pointing at the terminator. */
1247 1247          for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1248 1248                  sysdc_active[idx].sdl_list = &sysdc_dummy;
1249 1249          }
1250 1250  
1251 1251          sysdc_initparam();
1252 1252  
1253 1253          sysdccid = cid;
1254 1254          *clfuncspp = &sysdc_classfuncs;
1255 1255  
1256 1256          return ((pri_t)v.v_maxsyspri);
1257 1257  }
1258 1258  
1259 1259  static struct sclass csw = {

↓ open down ↓

1259 lines elided

↑ open up ↑

                   /*
                    * Class switch entry referenced by 'modlsched': presumably the
                    * class name, its init routine, and a zero trailing member
                    * (the struct sclass layout is not visible here -- confirm).
                    */
1260 1260          "SDC",
1261 1261          sysdc_init,
1262 1262          0
1263 1263  };
1264 1264  
1265 1265  static struct modlsched modlsched = {
                   /* Ties the scheduling-class module ops and description to 'csw'. */
1266 1266          &mod_schedops, "system duty cycle scheduling class", &csw
1267 1267  };
1268 1268  
1269 1269  static struct modlinkage modlinkage = {
                   /*
                    * Module linkage passed to mod_install()/mod_info().  In this
                    * diff the '-' line is the old initializer and the '+' line the
                    * new one: the pointer list gains its own braces as a nested
                    * aggregate, presumably for the missing-braces warning named in
                    * the change title (7127).
                    */
1270      -        MODREV_1, (void *)&modlsched, NULL
     1270 +        MODREV_1, { (void *)&modlsched, NULL }
1271 1271  };
1272 1272  
1273 1273  int
1274 1274  _init()
1275 1275  {
                   /* Module load entry point: returns mod_install()'s result. */
1276 1276          return (mod_install(&modlinkage));
1277 1277  }
1278 1278  
1279 1279  int
1280 1280  _fini()

1281 1281  {
                   /* Module unload entry point: always refuses with EBUSY. */
1282 1282          return (EBUSY);         /* can't unload for now */
1283 1283  }
1284 1284  
1285 1285  int
1286 1286  _info(struct modinfo *modinfop)
1287 1287  {
                   /* Module info entry point: returns mod_info()'s result. */
1288 1288          return (mod_info(&modlinkage, modinfop));
1289 1289  }
1290 1290  
1291 1291  /* --- consolidation-private interfaces --- */
1292 1292  void
1293 1293  sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1294 1294  {
                   /*
                    * Move thread t from the SYS class into the SDC class.
                    *
                    *   dc     requested target duty cycle; clamped below to the
                    *          range [sysdc_minDC, sysdc_maxDC]
                    *   flags  may only contain SYSDC_THREAD_BATCH (asserted); it is
                    *          not otherwise used in this function
                    *
                    * t must have an LWP, be in the SYS class, and have no class data
                    * (all asserted).  Allocation and class-entry failures are fatal
                    * via VERIFY0().
                    */
1295 1295          void *buf = NULL;
1296 1296          sysdc_params_t sdp;
1297 1297  
1298 1298          SYSDC_INC_STAT(sysdc_thread_enter_enter);
1299 1299  
1300 1300          ASSERT(sysdc_param_init);
1301 1301          ASSERT(sysdccid >= 0);
1302 1302  
1303 1303          ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1304 1304  
1305 1305          sdp.sdp_minpri = sysdc_minpri;
1306 1306          sdp.sdp_maxpri = sysdc_maxpri;
1307 1307          sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1308 1308  
1309 1309          VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1310 1310  
1311 1311          ASSERT(t->t_lwp != NULL);
1312 1312          ASSERT(t->t_cid == syscid);
1313 1313          ASSERT(t->t_cldata == NULL);
1314 1314          VERIFY0(CL_CANEXIT(t, NULL));
1315 1315          VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
                   /* The thread had no SYS class data (asserted above), hence NULL. */
1316 1316          CL_EXITCLASS(syscid, NULL);
1317 1317  }

↓ open down ↓

37 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX