11909 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/disp/sysdc.c
+++ new/usr/src/uts/common/disp/sysdc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * The System Duty Cycle (SDC) scheduling class
28 28 * --------------------------------------------
29 29 *
30 30 * Background
31 31 *
32 32 * Kernel threads in Solaris have traditionally not been large consumers
33 33 * of CPU time. They typically wake up, perform a small amount of
34 34 * work, then go back to sleep waiting for either a timeout or another
35 35 * signal. On the assumption that the small amount of work that they do
36 36 * is important for the behavior of the whole system, these threads are
37 37 * treated kindly by the dispatcher and the SYS scheduling class: they run
38 38 * without preemption from anything other than real-time and interrupt
39 39 * threads; when preempted, they are put at the front of the queue, so they
40 40 * generally do not migrate between CPUs; and they are allowed to stay
41 41 * running until they voluntarily give up the CPU.
42 42 *
43 43 * As Solaris has evolved, new workloads have emerged which require the
44 44 * kernel to perform significant amounts of CPU-intensive work. One
45 45 * example of such a workload is ZFS's transaction group sync processing.
46 46 * Each sync operation generates a large batch of I/Os, and each I/O
47 47 * may need to be compressed and/or checksummed before it is written to
48 48 * storage. The taskq threads which perform the compression and checksums
49 49 * will run nonstop as long as they have work to do; a large sync operation
50 50 * on a compression-heavy dataset can keep them busy for seconds on end.
51 51 * This causes human-time-scale dispatch latency bubbles for any other
52 52 * threads which have the misfortune to share a CPU with the taskq threads.
53 53 *
54 54 * The SDC scheduling class is a solution to this problem.
55 55 *
56 56 *
57 57 * Overview
58 58 *
59 59 * SDC is centered around the concept of a thread's duty cycle (DC):
60 60 *
61 61 * ONPROC time
62 62 * Duty Cycle = ----------------------
63 63 * ONPROC + Runnable time
64 64 *
65 65 * This is the ratio of the time that the thread spent running on a CPU
66 66 * divided by the time it spent running or trying to run. It is unaffected
67 67 * by any time the thread spent sleeping, stopped, etc.
68 68 *
69 69 * A thread joining the SDC class specifies a "target" DC that it wants
70 70 * to run at. To implement this policy, the routine sysdc_update() scans
71 71 * the list of active SDC threads every few ticks and uses each thread's
72 72 * microstate data to compute the actual duty cycle that that thread
73 73 * has experienced recently. If the thread is under its target DC, its
74 74 * priority is increased to the maximum available (sysdc_maxpri, which is
75 75 * 99 by default). If the thread is over its target DC, its priority is
76 76 * reduced to the minimum available (sysdc_minpri, 0 by default). This
77 77 * is a fairly primitive approach, in that it doesn't use any of the
78 78 * intermediate priorities, but it's not completely inappropriate. Even
79 79 * though threads in the SDC class might take a while to do their job, they
80 80 * are by some definition important if they're running inside the kernel,
81 81 * so it is reasonable that they should get to run at priority 99.
82 82 *
83 83 * If a thread is running when sysdc_update() calculates its actual duty
84 84 * cycle, and there are other threads of equal or greater priority on its
85 85 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
86 86 * acknowledges the preemption by calling sysdc_preempt(), which calls
87 87 * setbackdq(), which gives other threads with the same priority a chance
88 88 * to run. This creates a de facto time quantum for threads in the SDC
89 89 * scheduling class.
90 90 *
91 91 * An SDC thread which is assigned priority 0 can continue to run if
92 92 * nothing else needs to use the CPU that it's running on. Similarly, an
93 93 * SDC thread at priority 99 might not get to run as much as it wants to
94 94 * if there are other priority-99 or higher threads on its CPU. These
95 95 * situations would cause the thread to get ahead of or behind its target
96 96 * DC; the longer the situations lasted, the further ahead or behind the
97 97 * thread would get. Rather than condemning a thread to a lifetime of
98 98 * paying for its youthful indiscretions, SDC keeps "base" values for
99 99 * ONPROC and Runnable times in each thread's sysdc data, and updates these
100 100 * values periodically. The duty cycle is then computed using the elapsed
101 101 * amount of ONPROC and Runnable times since those base times.
102 102 *
103 103 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
104 104 * keep the list of "active" threads small by pruning out threads which
105 105 * have been asleep for a brief time. They are not pruned immediately upon
106 106 * going to sleep, since some threads may bounce back and forth between
107 107 * sleeping and being runnable.
108 108 *
109 109 *
110 110 * Interfaces
111 111 *
112 112 * void sysdc_thread_enter(t, dc, flags)
113 113 *
114 114 * Moves a kernel thread from the SYS scheduling class to the
115 115 * SDC class. t must have an associated LWP (created by calling
116 116 * lwp_kernel_create()). The thread will have a target DC of dc.
117 117 * Flags should be either 0 or SYSDC_THREAD_BATCH. If
118 118 * SYSDC_THREAD_BATCH is specified, the thread is expected to be
119 119 * doing large amounts of processing.
120 120 *
121 121 *
122 122 * Complications
123 123 *
124 124 * - Run queue balancing
125 125 *
126 126 * The Solaris dispatcher is biased towards letting a thread run
127 127 * on the same CPU which it last ran on, if no more than 3 ticks
128 128 * (i.e. rechoose_interval) have passed since the thread last ran.
129 129 * This helps to preserve cache warmth. On the other hand, it also
130 130 * tries to keep the per-CPU run queues fairly balanced; if the CPU
131 131 * chosen for a runnable thread has a run queue which is three or
132 132 * more threads longer than a neighboring CPU's queue, the runnable
133 133 * thread is dispatched onto the neighboring CPU instead.
134 134 *
135 135 * These policies work well for some workloads, but not for many SDC
136 136 * threads. The taskq client of SDC, for example, has many discrete
137 137 * units of work to do. The work units are largely independent, so
138 138 * cache warmth is not an important consideration. It is important
139 139 * that the threads fan out quickly to different CPUs, since the
140 140 * amount of work these threads have to do (a few seconds worth at a
141 141 * time) doesn't leave much time to correct thread placement errors
142 142 * (i.e. two SDC threads being dispatched to the same CPU).
143 143 *
144 144 * To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
145 145 * This tells the dispatcher to keep neighboring run queues' lengths
146 146 * more evenly matched, which allows SDC threads to migrate more
147 147 * easily.
148 148 *
149 149 * - LWPs and system processes
150 150 *
151 151 * SDC can only be used for kernel threads. Since SDC uses microstate
152 152 * accounting data to compute each thread's actual duty cycle, all
153 153 * threads entering the SDC class must have associated LWPs (which
154 154 * store the microstate data). This means that the threads have to
155 155 * be associated with an SSYS process, i.e. one created by newproc().
156 156 * If the microstate accounting information is ever moved into the
157 157 * kthread_t, this restriction could be lifted.
158 158 *
159 159 * - Dealing with oversubscription
160 160 *
161 161 * Since SDC duty cycles are per-thread, it is possible that the
162 162 * aggregate requested duty cycle of all SDC threads in a processor
163 163 * set could be greater than the total CPU time available in that set.
164 164 * The FSS scheduling class has an analogous situation, which it deals
165 165 * with by reducing each thread's allotted CPU time proportionally.
166 166 * Since SDC doesn't need to be as precise as FSS, it uses a simpler
167 167 * solution to the oversubscription problem.
168 168 *
169 169 * sysdc_update() accumulates the amount of time that max-priority SDC
170 170 * threads have spent on-CPU in each processor set, and uses that sum
171 171 * to create an implied duty cycle for that processor set:
172 172 *
173 173 * accumulated CPU time
174 174 * pset DC = -----------------------------------
175 175 * (# CPUs) * time since last update
176 176 *
177 177 * If this implied duty cycle is above a maximum pset duty cycle (90%
178 178 * by default), sysdc_update() sets the priority of all SDC threads
179 179 * in that processor set to sysdc_minpri for a "break" period. After
180 180 * the break period, it waits for a "nobreak" period before trying to
181 181 * enforce the pset duty cycle limit again.
182 182 *
183 183 * - Processor sets
184 184 *
185 185 * As the above implies, SDC is processor set aware, but it does not
186 186 * currently allow threads to change processor sets while in the SDC
187 187 * class. Instead, those threads must join the desired processor set
188 188 * before entering SDC. [1]
189 189 *
190 190 * - Batch threads
191 191 *
 192  192  *	A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
193 193 * flag. This flag currently has no effect, but marks threads which
194 194 * do bulk processing.
195 195 *
196 - * - t_kpri_req
197 - *
198 - * The TS and FSS scheduling classes pay attention to t_kpri_req,
199 - * which provides a simple form of priority inheritance for
200 - * synchronization primitives (such as rwlocks held as READER) which
201 - * cannot be traced to a unique thread. The SDC class does not honor
202 - * t_kpri_req, for a few reasons:
203 - *
204 - * 1. t_kpri_req is notoriously inaccurate. A measure of its
205 - * inaccuracy is that it needs to be cleared every time a thread
206 - * returns to user mode, because it is frequently non-zero at that
207 - * point. This can happen because "ownership" of synchronization
208 - * primitives that use t_kpri_req can be silently handed off,
209 - * leaving no opportunity to will the t_kpri_req inheritance.
210 - *
211 - * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
212 - * kernel priority. This means that even if an SDC thread
213 - * is holding a synchronization primitive and running at low
214 - * priority, its priority will eventually be raised above 60,
215 - * allowing it to drive on and release the resource.
216 - *
217 - * 3. The first consumer of SDC uses the taskq subsystem, which holds
218 - * a reader lock for the duration of the task's execution. This
219 - * would mean that SDC threads would never drop below kernel
220 - * priority in practice, which defeats one of the purposes of SDC.
221 - *
222 196 * - Why not FSS?
223 197 *
224 198 * It might seem that the existing FSS scheduling class could solve
225 199 * the problems that SDC is attempting to solve. FSS's more precise
226 200 * solution to the oversubscription problem would hardly cause
227 201 * trouble, as long as it performed well. SDC is implemented as
228 202 * a separate scheduling class for two main reasons: the initial
229 203 * consumer of SDC does not map well onto the "project" abstraction
230 204 * that is central to FSS, and FSS does not expect to run at kernel
231 205 * priorities.
232 206 *
233 207 *
234 208 * Tunables
235 209 *
236 210 * - sysdc_update_interval_msec: Number of milliseconds between
237 211 * consecutive thread priority updates.
238 212 *
239 213 * - sysdc_reset_interval_msec: Number of milliseconds between
240 214 * consecutive resets of a thread's base ONPROC and Runnable
241 215 * times.
242 216 *
243 217 * - sysdc_prune_interval_msec: Number of milliseconds of sleeping
244 218 * before a thread is pruned from the active list.
245 219 *
246 220 * - sysdc_max_pset_DC: Allowable percentage of a processor set's
247 221 * CPU time which SDC can give to its high-priority threads.
248 222 *
249 223 * - sysdc_break_msec: Number of milliseconds of "break" taken when
250 224 * sysdc_max_pset_DC is exceeded.
251 225 *
252 226 *
253 227 * Future work (in SDC and related subsystems)
254 228 *
255 229 * - Per-thread rechoose interval (0 for SDC)
256 230 *
257 231 * Allow each thread to specify its own rechoose interval. SDC
258 232 * threads would specify an interval of zero, which would rechoose
259 233 * the CPU with the lowest priority once per update.
260 234 *
261 235 * - Allow threads to change processor sets after joining the SDC class
262 236 *
263 237 * - Thread groups and per-group DC
264 238 *
265 239 * It might be nice to be able to specify a duty cycle which applies
266 240 * to a group of threads in aggregate.
267 241 *
268 242 * - Per-group DC callback to allow dynamic DC tuning
269 243 *
270 244 * Currently, DCs are assigned when the thread joins SDC. Some
271 245 * workloads could benefit from being able to tune their DC using
272 246 * subsystem-specific knowledge about the workload.
273 247 *
274 248 * - Finer-grained priority updates
275 249 *
276 250 * - More nuanced management of oversubscription
277 251 *
278 252 * - Moving other CPU-intensive threads into SDC
279 253 *
280 254 * - Move msacct data into kthread_t
281 255 *
282 256 * This would allow kernel threads without LWPs to join SDC.
283 257 *
284 258 *
285 259 * Footnotes
286 260 *
287 261 * [1] The details of doing so are left as an exercise for the reader.
288 262 */
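
As an editorial aside, the duty-cycle arithmetic described under "Overview" above, and the two-level priority decision it drives, can be made concrete with a small standalone sketch. Everything below is illustrative only: the helper and constant names are invented, and treating the duty cycle as a percentage out of 100 is an assumption that matches the 90% default of sysdc_max_pset_DC used later in this file.

/* Illustration only; not part of the kernel source. */
#include <stdio.h>
#include <stdint.h>

#define	DC_MAX		100	/* duty cycles expressed as percentages (assumed scale) */
#define	PRI_MIN		0	/* sysdc_minpri default */
#define	PRI_MAX		99	/* sysdc_maxpri default */

/* DC = ONPROC / (ONPROC + Runnable), scaled to DC_MAX. */
static uint32_t
duty_cycle(uint64_t onproc_ns, uint64_t runnable_ns)
{
	if (onproc_ns + runnable_ns == 0)
		return (0);
	return ((uint32_t)((onproc_ns * DC_MAX) / (onproc_ns + runnable_ns)));
}

int
main(void)
{
	uint64_t onproc = 300 * 1000000ULL;	/* 300ms spent running on a CPU */
	uint64_t runnable = 100 * 1000000ULL;	/* 100ms spent waiting on a run queue */
	uint32_t target = 50;			/* thread asked for a 50% duty cycle */
	uint32_t cur = duty_cycle(onproc, runnable);

	/* Bang-bang policy: under target, jump to maximum; at or over, drop to minimum. */
	int pri = (cur < target) ? PRI_MAX : PRI_MIN;

	printf("current DC %u%%, target %u%% -> priority %d\n", cur, target, pri);
	return (0);
}

Here the thread has been on CPU 75% of the time it was runnable, which is over its 50% target, so it would be dropped to the minimum priority until its measured duty cycle falls back under the target.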
289 263
290 264 #include <sys/types.h>
291 265 #include <sys/sysdc.h>
292 266 #include <sys/sysdc_impl.h>
293 267
294 268 #include <sys/class.h>
295 269 #include <sys/cmn_err.h>
296 270 #include <sys/cpuvar.h>
297 271 #include <sys/cpupart.h>
298 272 #include <sys/debug.h>
299 273 #include <sys/disp.h>
300 274 #include <sys/errno.h>
301 275 #include <sys/inline.h>
302 276 #include <sys/kmem.h>
303 277 #include <sys/modctl.h>
304 278 #include <sys/schedctl.h>
305 279 #include <sys/sdt.h>
306 280 #include <sys/sunddi.h>
307 281 #include <sys/sysmacros.h>
308 282 #include <sys/systm.h>
309 283 #include <sys/var.h>
310 284
311 285 /*
312 286 * Tunables - loaded into the internal state at module load time
313 287 */
314 288 uint_t sysdc_update_interval_msec = 20;
315 289 uint_t sysdc_reset_interval_msec = 400;
316 290 uint_t sysdc_prune_interval_msec = 100;
317 291 uint_t sysdc_max_pset_DC = 90;
318 292 uint_t sysdc_break_msec = 80;
319 293
320 294 /*
321 295 * Internal state - constants set up by sysdc_initparam()
322 296 */
323 297 static clock_t sysdc_update_ticks; /* ticks between updates */
324 298 static uint_t sysdc_prune_updates; /* updates asleep before pruning */
325 299 static uint_t sysdc_reset_updates; /* # of updates before reset */
326 300 static uint_t sysdc_break_updates; /* updates to break */
327 301 static uint_t sysdc_nobreak_updates; /* updates to not check */
328 302 static uint_t sysdc_minDC; /* minimum allowed DC */
329 303 static uint_t sysdc_maxDC; /* maximum allowed DC */
330 304 static pri_t sysdc_minpri; /* minimum allowed priority */
331 305 static pri_t sysdc_maxpri; /* maximum allowed priority */
332 306
333 307 /*
334 308 * Internal state
335 309 */
336 310 static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
337 311 static list_t sysdc_psets; /* list of psets with SDC threads */
338 312 static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
339 313 static uint_t sysdc_update_timeout_started; /* update timeout is active */
340 314 static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
341 315 static sysdc_t sysdc_dummy; /* used to terminate active lists */
342 316
343 317 /*
344 318 * Internal state - active hash table
345 319 */
346 320 #define SYSDC_NLISTS 8
347 321 #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
348 322 static sysdc_list_t sysdc_active[SYSDC_NLISTS];
349 323 #define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
350 324
351 325 #ifdef DEBUG
352 326 static struct {
353 327 uint64_t sysdc_update_times_asleep;
354 328 uint64_t sysdc_update_times_base_ran_backwards;
355 329 uint64_t sysdc_update_times_already_done;
356 330 uint64_t sysdc_update_times_cur_ran_backwards;
357 331 uint64_t sysdc_compute_pri_breaking;
358 332 uint64_t sysdc_activate_enter;
359 333 uint64_t sysdc_update_enter;
360 334 uint64_t sysdc_update_exited;
361 335 uint64_t sysdc_update_not_sdc;
362 336 uint64_t sysdc_update_idle;
363 337 uint64_t sysdc_update_take_break;
364 338 uint64_t sysdc_update_no_psets;
365 339 uint64_t sysdc_tick_not_sdc;
366 340 uint64_t sysdc_tick_quantum_expired;
367 341 uint64_t sysdc_thread_enter_enter;
368 342 } sysdc_stats;
369 343
370 344 #define SYSDC_INC_STAT(x) (sysdc_stats.x++)
371 345 #else
372 346 #define SYSDC_INC_STAT(x) ((void)0)
373 347 #endif
374 348
375 349 /* macros are UPPER CASE */
376 350 #define HOWMANY(a, b) howmany((a), (b))
377 351 #define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
378 352
379 353 static void
380 354 sysdc_initparam(void)
381 355 {
382 356 uint_t sysdc_break_ticks;
383 357
384 358 /* update / prune intervals */
385 359 sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
386 360
387 361 sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
388 362 sysdc_update_interval_msec);
389 363 sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
390 364 sysdc_update_interval_msec);
391 365
392 366 /* We must get at least a little time on CPU. */
393 367 sysdc_minDC = 1;
394 368 sysdc_maxDC = SYSDC_DC_MAX;
395 369 sysdc_minpri = 0;
396 370 sysdc_maxpri = maxclsyspri - 1;
397 371
398 372 /* break parameters */
399 373 if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
400 374 sysdc_max_pset_DC = SYSDC_DC_MAX;
401 375 }
402 376 sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
403 377 sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
404 378
405 379 /*
406 380 * We want:
407 381 *
408 382 * sysdc_max_pset_DC = (nobreak / (break + nobreak))
409 383 *
410 384 * ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
411 385 *
412 386 * sysdc_max_pset_DC * break
413 387 * ==> nobreak = -------------------------
414 388 * 1 - sysdc_max_pset_DC
415 389 */
416 390 sysdc_nobreak_updates =
417 391 HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
418 392 (SYSDC_DC_MAX - sysdc_max_pset_DC));
419 393
420 394 sysdc_param_init = 1;
421 395 }
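
To see what sysdc_initparam() produces in practice, assume the default tunables above, a 100 Hz system clock (usec_per_tick = 10000), and SYSDC_DC_MAX taken as the percentage scale of 100; the clock rate is platform-dependent, so these numbers are illustrative rather than guaranteed:

	sysdc_update_ticks    = howmany(20 * 1000, 10000)  = 2 ticks between updates
	sysdc_prune_updates   = howmany(100, 20)           = 5 updates asleep before pruning
	sysdc_reset_updates   = howmany(400, 20)           = 20 updates between base resets
	sysdc_break_ticks     = howmany(80 * 1000, 10000)  = 8 ticks
	sysdc_break_updates   = howmany(8, 2)              = 4 updates of "break"
	sysdc_nobreak_updates = howmany(4 * 90, 100 - 90)  = 36 updates of "nobreak"

Over a full break/nobreak cycle the pset is therefore unrestricted for 36 / (4 + 36) = 90% of updates, matching sysdc_max_pset_DC. As a further illustration, on a 4-CPU pset a 20ms interval supplies 80ms of CPU time; if max-priority SDC threads accumulated 76ms of ONPROC time in that interval, the pset duty cycle would be 95%, exceeding the 90% limit and starting a break.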
422 396
423 397 #undef HOWMANY
424 398 #undef MSECTOTICKS
425 399
426 400 #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
427 401 #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
428 402 #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
429 403
430 404 /*
431 405 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
432 406 * and Runnable times since the last reset.
433 407 *
434 408 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
435 409 * to track processor set usage.
436 410 */
437 411 static void
438 412 sysdc_update_times(sysdc_t *sdc, uint_t flags,
439 413 hrtime_t *O, hrtime_t *R, hrtime_t *newO)
440 414 {
441 415 kthread_t *const t = sdc->sdc_thread;
442 416 const uint_t initial = (flags & SDC_UPDATE_INITIAL);
443 417 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
444 418 const clock_t now = ddi_get_lbolt();
445 419 uint_t do_reset;
446 420
447 421 ASSERT(THREAD_LOCK_HELD(t));
448 422
449 423 *O = *R = 0;
450 424
451 425 /* If we've been sleeping, we know we haven't had any ONPROC time. */
452 426 if (sdc->sdc_sleep_updates != 0 &&
453 427 sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
454 428 *newO = sdc->sdc_last_base_O;
455 429 SYSDC_INC_STAT(sysdc_update_times_asleep);
456 430 return;
457 431 }
458 432
459 433 /*
460 434 * If this is our first update, or we've hit the reset point,
461 435 * we need to reset our base_{O,R}. Once we've updated them, we
462 436 * report O and R for the entire prior interval.
463 437 */
464 438 do_reset = initial;
465 439 if (update) {
466 440 ++sdc->sdc_nupdates;
467 441 if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
468 442 do_reset = 1;
469 443 }
470 444 if (do_reset) {
471 445 hrtime_t baseO, baseR;
472 446 if (initial) {
473 447 /*
474 448 * Start off our cycle count somewhere in the middle,
475 449 * to keep the resets from all happening at once.
476 450 *
477 451 * 4999 is a handy prime much larger than
478 452 * sysdc_reset_updates, so that we don't run into
479 453 * trouble if the resolution is a multiple of
480 454 * sysdc_reset_updates.
481 455 */
482 456 sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
483 457 sysdc_reset_updates);
484 458 baseO = baseR = 0;
485 459 } else {
486 460 baseO = sdc->sdc_base_O;
487 461 baseR = sdc->sdc_base_R;
488 462 }
489 463
490 464 mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
491 465 *newO = sdc->sdc_base_O;
492 466
493 467 sdc->sdc_reset = now;
494 468 sdc->sdc_pri_check = -1; /* force mismatch below */
495 469
496 470 /*
497 471 * See below for rationale.
498 472 */
499 473 if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
500 474 SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
501 475 baseO = sdc->sdc_base_O;
502 476 baseR = sdc->sdc_base_R;
503 477 }
504 478
505 479 /* compute based on the entire interval */
506 480 *O = (sdc->sdc_base_O - baseO);
507 481 *R = (sdc->sdc_base_R - baseR);
508 482 return;
509 483 }
510 484
511 485 /*
512 486 * If we're called from sysdc_update(), we *must* return a value
513 487 * for newO, so we always call mstate_systhread_times().
514 488 *
515 489 * Otherwise, if we've already done a pri check this tick,
516 490 * we can skip it.
517 491 */
518 492 if (!update && sdc->sdc_pri_check == now) {
519 493 SYSDC_INC_STAT(sysdc_update_times_already_done);
520 494 return;
521 495 }
522 496
523 497 /* Get the current times from the thread */
524 498 sdc->sdc_pri_check = now;
525 499 mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
526 500 *newO = sdc->sdc_cur_O;
527 501
528 502 /*
529 503 * The updating of microstate accounting is not done under a
530 504 * consistent set of locks, particularly the t_waitrq field. This
531 505 * can lead to narrow windows in which we account for time in the
532 506 * wrong bucket, which on the next read will be accounted for
533 507 * correctly.
534 508 *
535 509 * If our sdc_base_* fields were affected by one of these blips, we
536 510 * throw away the old data, and pretend this tick didn't happen.
537 511 */
538 512 if (sdc->sdc_cur_O < sdc->sdc_base_O ||
539 513 sdc->sdc_cur_R < sdc->sdc_base_R) {
540 514
541 515 sdc->sdc_base_O = sdc->sdc_cur_O;
542 516 sdc->sdc_base_R = sdc->sdc_cur_R;
543 517
544 518 SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
545 519 return;
546 520 }
547 521
548 522 *O = sdc->sdc_cur_O - sdc->sdc_base_O;
549 523 *R = sdc->sdc_cur_R - sdc->sdc_base_R;
550 524 }
551 525
552 526 /*
553 527 * sysdc_compute_pri()
554 528 *
555 529 * Recomputes the priority of the thread, leaving the result in
556 530 * sdc->sdc_epri. Returns 1 if a priority update should occur
557 531 * (which will also trigger a cpu_surrender()), otherwise
558 532 * returns 0.
559 533 */
560 534 static uint_t
561 535 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
562 536 {
563 537 kthread_t *const t = sdc->sdc_thread;
564 538 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
565 539 const uint_t tick = (flags & SDC_UPDATE_TICK);
566 540
567 541 hrtime_t O, R;
568 542 hrtime_t newO = -1;
569 543
570 544 ASSERT(THREAD_LOCK_HELD(t));
571 545
572 546 sysdc_update_times(sdc, flags, &O, &R, &newO);
573 547 ASSERT(!update || newO != -1);
574 548
575 549 /* If we have new data, recompute our priority. */
576 550 if ((O + R) != 0) {
577 551 sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
578 552
579 553 /* Adjust our priority to move our DC closer to the target. */
580 554 if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
581 555 sdc->sdc_pri = sdc->sdc_maxpri;
582 556 else
583 557 sdc->sdc_pri = sdc->sdc_minpri;
584 558 }
585 559
586 560 /*
587 561 * If our per-pset duty cycle goes over the max, we will take a break.
588 562 * This forces all sysdc threads in the pset to minimum priority, in
589 563 * order to let everyone else have a chance at the CPU.
590 564 */
591 565 if (sdc->sdc_pset->sdp_need_break) {
592 566 SYSDC_INC_STAT(sysdc_compute_pri_breaking);
593 567 sdc->sdc_epri = sdc->sdc_minpri;
594 568 } else {
595 569 sdc->sdc_epri = sdc->sdc_pri;
596 570 }
597 571
598 572 DTRACE_PROBE4(sysdc__compute__pri,
599 573 kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
600 574 uint_t, sdc->sdc_target_DC);
601 575
602 576 /*
603 577 * For sysdc_update(), we compute the ONPROC time for high-priority
604 578 * threads, which is used to calculate the per-pset duty cycle. We
605 579 * will always tell our callers to update the thread's priority,
606 580 * since we want to force a cpu_surrender().
607 581 *
608 582 * We reset sdc_update_ticks so that sysdc_tick() will only update
609 583 * the thread's priority if our timeout is delayed by a tick or
610 584 * more.
611 585 */
612 586 if (update) {
613 587 /* SDC threads are not allowed to change cpupart bindings. */
614 588 ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
615 589
616 590 /* If we were at MAXPRI, account for our onproc time. */
617 591 if (t->t_pri == sdc->sdc_maxpri &&
618 592 sdc->sdc_last_base_O != 0 &&
619 593 sdc->sdc_last_base_O < newO) {
620 594 sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
621 595 sdc->sdc_pset->sdp_onproc_time +=
622 596 (uint64_t)sdc->sdc_last_O;
623 597 sdc->sdc_pset->sdp_onproc_threads++;
624 598 } else {
625 599 sdc->sdc_last_O = 0;
626 600 }
627 601 sdc->sdc_last_base_O = newO;
628 602
629 603 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
630 604 return (1);
631 605 }
632 606
633 607 /*
634 608 * Like sysdc_update(), sysdc_tick() always wants to update the
635 609 * thread's priority, so that the CPU is surrendered if necessary.
636 610 * We reset sdc_update_ticks so that if the timeout continues to be
637 611 * delayed, we'll update at the regular interval.
638 612 */
639 613 if (tick) {
640 614 ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
641 615 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
642 616 return (1);
643 617 }
644 618
645 619 /*
646 620 * Otherwise, only tell our callers to update the priority if it has
647 621 * changed.
648 622 */
649 623 return (sdc->sdc_epri != t->t_pri);
650 624 }
651 625
652 626 static void
653 627 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
654 628 {
655 629 kthread_t *t = sdc->sdc_thread;
656 630
657 631 ASSERT(THREAD_LOCK_HELD(t));
658 632
659 633 if (sysdc_compute_pri(sdc, flags)) {
660 634 if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
661 635 cpu_surrender(t);
662 636 }
663 637 }
664 638 }
665 639
666 640 /*
667 641 * Add a thread onto the active list. It will only be removed by
668 642 * sysdc_update().
669 643 */
670 644 static void
671 645 sysdc_activate(sysdc_t *sdc)
672 646 {
673 647 sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
674 648 sysdc_t *head;
675 649 kthread_t *t = sdc->sdc_thread;
676 650
677 651 SYSDC_INC_STAT(sysdc_activate_enter);
678 652
679 653 ASSERT(sdc->sdc_next == NULL);
680 654 ASSERT(THREAD_LOCK_HELD(t));
681 655
682 656 do {
683 657 head = *headp;
684 658 sdc->sdc_next = head;
685 659 } while (atomic_cas_ptr(headp, head, sdc) != head);
686 660 }
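
The CAS loop in sysdc_activate() is the standard lock-free push onto a singly linked list. Below is a minimal standalone rendering of the same idiom, using C11 atomics in place of the kernel's atomic_cas_ptr(); the names here are illustrative, not kernel APIs.

#include <stdatomic.h>

struct node {
	struct node *next;
};

/* Push n in front of whatever head we last observed; retry if we lost a race. */
static void
lockfree_push(_Atomic(struct node *) *headp, struct node *n)
{
	struct node *head = atomic_load(headp);

	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(headp, &head, n));
}

int
main(void)
{
	static struct node dummy;		/* list terminator, like sysdc_dummy */
	_Atomic(struct node *) head = &dummy;
	struct node a, b;

	lockfree_push(&head, &a);		/* list: a -> dummy */
	lockfree_push(&head, &b);		/* list: b -> a -> dummy */
	return (0);
}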
687 661
688 662 /*
689 663 * sysdc_update() has two jobs:
690 664 *
691 665 * 1. It updates the priorities of all active SDC threads on the system.
692 666 * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
693 667 */
694 668 static void
695 669 sysdc_update(void *arg)
696 670 {
697 671 int idx;
698 672 sysdc_t *freelist = NULL;
699 673 sysdc_pset_t *cur;
700 674 hrtime_t now, diff;
701 675 uint_t redeploy = 1;
702 676
703 677 SYSDC_INC_STAT(sysdc_update_enter);
704 678
705 679 ASSERT(sysdc_update_timeout_started);
706 680
707 681 /*
708 682 * If this is our first time through, diff will be gigantic, and
709 683 * no breaks will be necessary.
710 684 */
711 685 now = gethrtime();
712 686 diff = now - sysdc_last_update;
713 687 sysdc_last_update = now;
714 688
715 689 mutex_enter(&sysdc_pset_lock);
716 690 for (cur = list_head(&sysdc_psets); cur != NULL;
717 691 cur = list_next(&sysdc_psets, cur)) {
718 692 boolean_t breaking = (cur->sdp_should_break != 0);
719 693
720 694 if (cur->sdp_need_break != breaking) {
721 695 DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
722 696 boolean_t, breaking);
723 697 }
724 698 cur->sdp_onproc_time = 0;
725 699 cur->sdp_onproc_threads = 0;
726 700 cur->sdp_need_break = breaking;
727 701 }
728 702 mutex_exit(&sysdc_pset_lock);
729 703
730 704 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
731 705 sysdc_list_t *sdl = &sysdc_active[idx];
732 706 sysdc_t *volatile *headp = &sdl->sdl_list;
733 707 sysdc_t *head, *tail;
734 708 sysdc_t **prevptr;
735 709
736 710 if (*headp == &sysdc_dummy)
737 711 continue;
738 712
739 713 /* Prevent any threads from exiting while we're poking them. */
740 714 mutex_enter(&sdl->sdl_lock);
741 715
742 716 /*
743 717 * Each sdl_list contains a singly-linked list of active
744 718 * threads. Threads which become active while we are
745 719 * processing the list will be added to sdl_list. Since we
746 720 * don't want that to interfere with our own processing, we
747 721 * swap in an empty list. Any newly active threads will
748 722 * go on to this empty list. When finished, we'll put any
749 723 * such threads at the end of the processed list.
750 724 */
751 725 head = atomic_swap_ptr(headp, &sysdc_dummy);
752 726 prevptr = &head;
753 727 while (*prevptr != &sysdc_dummy) {
754 728 sysdc_t *const sdc = *prevptr;
755 729 kthread_t *const t = sdc->sdc_thread;
756 730
757 731 /*
758 732 * If the thread has exited, move its sysdc_t onto
759 733 * freelist, to be freed later.
760 734 */
761 735 if (t == NULL) {
762 736 *prevptr = sdc->sdc_next;
763 737 SYSDC_INC_STAT(sysdc_update_exited);
764 738 sdc->sdc_next = freelist;
765 739 freelist = sdc;
766 740 continue;
767 741 }
768 742
769 743 thread_lock(t);
770 744 if (t->t_cid != sysdccid) {
771 745 thread_unlock(t);
772 746 prevptr = &sdc->sdc_next;
773 747 SYSDC_INC_STAT(sysdc_update_not_sdc);
774 748 continue;
775 749 }
776 750 ASSERT(t->t_cldata == sdc);
777 751
778 752 /*
779 753 * If the thread has been sleeping for longer
780 754 * than sysdc_prune_interval, make it inactive by
781 755 * removing it from the list.
782 756 */
783 757 if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
784 758 sdc->sdc_sleep_updates != 0 &&
785 759 (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
786 760 sysdc_prune_updates) {
787 761 *prevptr = sdc->sdc_next;
788 762 SYSDC_INC_STAT(sysdc_update_idle);
789 763 sdc->sdc_next = NULL;
790 764 thread_unlock(t);
791 765 continue;
792 766 }
793 767 sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
794 768 thread_unlock(t);
795 769
796 770 prevptr = &sdc->sdc_next;
797 771 }
798 772
799 773 /*
800 774 * Add our list to the bucket, putting any new entries
801 775 * added while we were working at the tail of the list.
802 776 */
803 777 do {
804 778 tail = *headp;
805 779 *prevptr = tail;
806 780 } while (atomic_cas_ptr(headp, tail, head) != tail);
807 781
808 782 mutex_exit(&sdl->sdl_lock);
809 783 }
810 784
811 785 mutex_enter(&sysdc_pset_lock);
812 786 for (cur = list_head(&sysdc_psets); cur != NULL;
813 787 cur = list_next(&sysdc_psets, cur)) {
814 788
815 789 cur->sdp_vtime_last_interval =
816 790 diff * cur->sdp_cpupart->cp_ncpus;
817 791 cur->sdp_DC_last_interval =
818 792 (cur->sdp_onproc_time * SYSDC_DC_MAX) /
819 793 cur->sdp_vtime_last_interval;
820 794
821 795 if (cur->sdp_should_break > 0) {
822 796 cur->sdp_should_break--; /* breaking */
823 797 continue;
824 798 }
825 799 if (cur->sdp_dont_break > 0) {
826 800 cur->sdp_dont_break--; /* waiting before checking */
827 801 continue;
828 802 }
829 803 if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
830 804 cur->sdp_should_break = sysdc_break_updates;
831 805 cur->sdp_dont_break = sysdc_nobreak_updates;
832 806 SYSDC_INC_STAT(sysdc_update_take_break);
833 807 }
834 808 }
835 809
836 810 /*
837 811 * If there are no sysdc_psets, there can be no threads, so
838 812 * we can stop doing our timeout. Since we're holding the
839 813 * sysdc_pset_lock, no new sysdc_psets can come in, which will
840 814 * prevent anyone from racing with this and dropping our timeout
841 815 * on the floor.
842 816 */
843 817 if (list_is_empty(&sysdc_psets)) {
844 818 SYSDC_INC_STAT(sysdc_update_no_psets);
845 819 ASSERT(sysdc_update_timeout_started);
846 820 sysdc_update_timeout_started = 0;
847 821
848 822 redeploy = 0;
849 823 }
850 824 mutex_exit(&sysdc_pset_lock);
851 825
852 826 while (freelist != NULL) {
853 827 sysdc_t *cur = freelist;
854 828 freelist = cur->sdc_next;
855 829 kmem_free(cur, sizeof (*cur));
856 830 }
857 831
858 832 if (redeploy) {
859 833 (void) timeout(sysdc_update, arg, sysdc_update_ticks);
860 834 }
861 835 }
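
sysdc_update()'s handling of each bucket (detach the whole list, process it privately, then splice it back in front of any newcomers) is the other half of the lock-free scheme started by sysdc_activate(). A standalone sketch of that idiom follows, again with C11 atomics standing in for atomic_swap_ptr()/atomic_cas_ptr(); all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static struct node sentinel;			/* plays the role of sysdc_dummy */
static _Atomic(struct node *) list_head = &sentinel;

static void
drain_and_splice(void)
{
	struct node *head, *tail, **prevptr;

	/* Swap in the sentinel; everything previously queued is now private to us. */
	head = atomic_exchange(&list_head, &sentinel);

	/* Walk the private list; prevptr ends up addressing its final link. */
	for (prevptr = &head; *prevptr != &sentinel; prevptr = &(*prevptr)->next)
		printf("processing node %d\n", (*prevptr)->id);

	/* Splice the processed list back in front of any nodes queued meanwhile. */
	do {
		tail = atomic_load(&list_head);
		*prevptr = tail;
	} while (!atomic_compare_exchange_weak(&list_head, &tail, head));
}

int
main(void)
{
	struct node a = { &sentinel, 1 }, b = { &a, 2 }, c = { &b, 3 };

	atomic_store(&list_head, &c);		/* list: c -> b -> a -> sentinel */
	drain_and_splice();
	return (0);
}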
862 836
863 837 static void
864 838 sysdc_preempt(kthread_t *t)
865 839 {
866 840 ASSERT(t == curthread);
867 841 ASSERT(THREAD_LOCK_HELD(t));
868 842
869 843 setbackdq(t); /* give others a chance to run */
870 844 }
871 845
872 846 static void
873 847 sysdc_tick(kthread_t *t)
874 848 {
875 849 sysdc_t *sdc;
876 850
877 851 thread_lock(t);
878 852 if (t->t_cid != sysdccid) {
879 853 SYSDC_INC_STAT(sysdc_tick_not_sdc);
880 854 thread_unlock(t);
881 855 return;
882 856 }
883 857 sdc = t->t_cldata;
884 858 if (t->t_state == TS_ONPROC &&
885 859 t->t_pri < t->t_disp_queue->disp_maxrunpri) {
886 860 cpu_surrender(t);
887 861 }
888 862
889 863 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
890 864 ASSERT(sdc->sdc_sleep_updates == 0);
891 865 }
892 866
893 867 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
894 868 sdc->sdc_ticks++;
895 869 if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
896 870 SYSDC_INC_STAT(sysdc_tick_quantum_expired);
897 871 sysdc_update_pri(sdc, SDC_UPDATE_TICK);
898 872 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
899 873 }
900 874 thread_unlock(t);
901 875 }
902 876
903 877 static void
904 878 sysdc_setrun(kthread_t *t)
905 879 {
906 880 sysdc_t *sdc = t->t_cldata;
907 881
908 882 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
909 883
910 884 sdc->sdc_sleep_updates = 0;
911 885
912 886 if (sdc->sdc_next == NULL) {
913 887 /*
914 888 * Since we're in transition, we don't want to use the
915 889 * full thread_update_pri().
916 890 */
917 891 if (sysdc_compute_pri(sdc, 0)) {
918 892 THREAD_CHANGE_PRI(t, sdc->sdc_epri);
919 893 }
920 894 sysdc_activate(sdc);
921 895
922 896 ASSERT(sdc->sdc_next != NULL);
923 897 }
924 898
925 899 setbackdq(t);
926 900 }
927 901
928 902 static void
929 903 sysdc_wakeup(kthread_t *t)
930 904 {
931 905 sysdc_setrun(t);
932 906 }
933 907
934 908 static void
935 909 sysdc_sleep(kthread_t *t)
936 910 {
937 911 sysdc_t *sdc = t->t_cldata;
938 912
939 913 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
940 914
941 915 sdc->sdc_sleep_updates = sdc->sdc_nupdates;
942 916 }
943 917
944 918 /*ARGSUSED*/
945 919 static int
946 920 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
947 921 void *bufp)
948 922 {
949 923 cpupart_t *const cpupart = t->t_cpupart;
950 924 sysdc_t *sdc = bufp;
951 925 sysdc_params_t *sdpp = parmsp;
952 926 sysdc_pset_t *newpset = sdc->sdc_pset;
953 927 sysdc_pset_t *pset;
954 928 int start_timeout;
955 929
956 930 if (t->t_cid != syscid)
957 931 return (EPERM);
958 932
959 933 ASSERT(ttolwp(t) != NULL);
960 934 ASSERT(sdpp != NULL);
961 935 ASSERT(newpset != NULL);
962 936 ASSERT(sysdc_param_init);
963 937
964 938 ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
965 939 ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
966 940 ASSERT(sdpp->sdp_DC >= sysdc_minDC);
967 941 ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
968 942
969 943 sdc->sdc_thread = t;
970 944 sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */
971 945 sdc->sdc_minpri = sdpp->sdp_minpri;
972 946 sdc->sdc_maxpri = sdpp->sdp_maxpri;
973 947 sdc->sdc_target_DC = sdpp->sdp_DC;
974 948 sdc->sdc_ticks = 0;
975 949 sdc->sdc_update_ticks = sysdc_update_ticks + 1;
976 950
977 951 /* Assign ourselves to the appropriate pset. */
978 952 sdc->sdc_pset = NULL;
979 953 mutex_enter(&sysdc_pset_lock);
980 954 for (pset = list_head(&sysdc_psets); pset != NULL;
981 955 pset = list_next(&sysdc_psets, pset)) {
982 956 if (pset->sdp_cpupart == cpupart) {
983 957 break;
984 958 }
985 959 }
986 960 if (pset == NULL) {
987 961 pset = newpset;
988 962 newpset = NULL;
989 963 pset->sdp_cpupart = cpupart;
990 964 list_insert_tail(&sysdc_psets, pset);
991 965 }
992 966 pset->sdp_nthreads++;
993 967 ASSERT(pset->sdp_nthreads > 0);
994 968
995 969 sdc->sdc_pset = pset;
996 970
997 971 start_timeout = (sysdc_update_timeout_started == 0);
998 972 sysdc_update_timeout_started = 1;
999 973 mutex_exit(&sysdc_pset_lock);
1000 974
1001 975 if (newpset != NULL)
1002 976 kmem_free(newpset, sizeof (*newpset));
1003 977
1004 978 /* Update t's scheduling class and priority. */
1005 979 thread_lock(t);
1006 980 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1007 981 t->t_cid = cid;
1008 982 t->t_cldata = sdc;
1009 983 t->t_schedflag |= TS_RUNQMATCH;
1010 984
1011 985 sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1012 986 thread_unlock(t);
1013 987
1014 988 /* Kick off the thread timeout if we're the first one in. */
1015 989 if (start_timeout) {
1016 990 (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1017 991 }
1018 992
1019 993 return (0);
1020 994 }
1021 995
1022 996 static void
1023 997 sysdc_leave(sysdc_t *sdc)
1024 998 {
1025 999 sysdc_pset_t *sdp = sdc->sdc_pset;
1026 1000 sysdc_list_t *sdl = SYSDC_LIST(sdc);
1027 1001 uint_t freedc;
1028 1002
1029 1003 mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */
1030 1004 sdc->sdc_thread = NULL;
1031 1005 freedc = (sdc->sdc_next == NULL);
1032 1006 mutex_exit(&sdl->sdl_lock);
1033 1007
1034 1008 mutex_enter(&sysdc_pset_lock);
1035 1009 ASSERT(sdp != NULL);
1036 1010 ASSERT(sdp->sdp_nthreads > 0);
1037 1011 --sdp->sdp_nthreads;
1038 1012 if (sdp->sdp_nthreads == 0) {
1039 1013 list_remove(&sysdc_psets, sdp);
1040 1014 } else {
1041 1015 sdp = NULL;
1042 1016 }
1043 1017 mutex_exit(&sysdc_pset_lock);
1044 1018
1045 1019 if (freedc)
1046 1020 kmem_free(sdc, sizeof (*sdc));
1047 1021 if (sdp != NULL)
1048 1022 kmem_free(sdp, sizeof (*sdp));
1049 1023 }
1050 1024
1051 1025 static void
1052 1026 sysdc_exitclass(void *buf)
1053 1027 {
1054 1028 sysdc_leave((sysdc_t *)buf);
1055 1029 }
1056 1030
1057 1031 /*ARGSUSED*/
1058 1032 static int
1059 1033 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1060 1034 {
1061 1035 /* Threads cannot exit SDC once joined, except in a body bag. */
1062 1036 return (EPERM);
1063 1037 }
1064 1038
1065 1039 static void
1066 1040 sysdc_exit(kthread_t *t)
1067 1041 {
1068 1042 sysdc_t *sdc;
1069 1043
1070 1044 /* We're exiting, so we just rejoin the SYS class. */
1071 1045 thread_lock(t);
1072 1046 ASSERT(t->t_cid == sysdccid);
1073 1047 sdc = t->t_cldata;
1074 1048 t->t_cid = syscid;
1075 1049 t->t_cldata = NULL;
1076 1050 t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1077 1051 (void) thread_change_pri(t, maxclsyspri, 0);
1078 1052 t->t_schedflag &= ~TS_RUNQMATCH;
1079 1053 thread_unlock_nopreempt(t);
1080 1054
1081 1055 /* Unlink the sdc from everything. */
1082 1056 sysdc_leave(sdc);
1083 1057 }
1084 1058
1085 1059 /*ARGSUSED*/
1086 1060 static int
1087 1061 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1088 1062 {
1089 1063 /*
1090 1064 * Threads cannot be created with SDC as their class; they must
1091 1065 * be created as SYS and then added with sysdc_thread_enter().
1092 1066 * Because of this restriction, sysdc_fork() should never be called.
1093 1067 */
1094 1068 panic("sysdc cannot be forked");
1095 1069
1096 1070 return (ENOSYS);
1097 1071 }
1098 1072
1099 1073 /*ARGSUSED*/
1100 1074 static void
1101 1075 sysdc_forkret(kthread_t *t, kthread_t *ct)
1102 1076 {
1103 1077 /* SDC threads are part of system processes, which never fork. */
1104 1078 panic("sysdc cannot be forked");
1105 1079 }
1106 1080
1107 1081 static pri_t
1108 1082 sysdc_globpri(kthread_t *t)
1109 1083 {
1110 1084 return (t->t_epri);
1111 1085 }
1112 1086
1113 1087 /*ARGSUSED*/
1114 1088 static pri_t
1115 1089 sysdc_no_swap(kthread_t *t, int flags)
1116 1090 {
1117 1091 /* SDC threads cannot be swapped. */
1118 1092 return (-1);
1119 1093 }
1120 1094
1121 1095 /*
1122 1096 * Get maximum and minimum priorities enjoyed by SDC threads.
1123 1097 */
1124 1098 static int
1125 1099 sysdc_getclpri(pcpri_t *pcprip)
1126 1100 {
1127 1101 pcprip->pc_clpmax = sysdc_maxpri;
1128 1102 pcprip->pc_clpmin = sysdc_minpri;
1129 1103 return (0);
1130 1104 }
1131 1105
1132 1106 /*ARGSUSED*/
1133 1107 static int
1134 1108 sysdc_getclinfo(void *arg)
1135 1109 {
1136 1110 return (0); /* no class-specific info */
1137 1111 }
1138 1112
1139 1113 /*ARGSUSED*/
1140 1114 static int
1141 1115 sysdc_alloc(void **p, int flag)
1142 1116 {
1143 1117 sysdc_t *new;
1144 1118
1145 1119 *p = NULL;
1146 1120 if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1147 1121 return (ENOMEM);
1148 1122 }
1149 1123 if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1150 1124 NULL) {
1151 1125 kmem_free(new, sizeof (*new));
1152 1126 return (ENOMEM);
1153 1127 }
1154 1128 *p = new;
1155 1129 return (0);
1156 1130 }
1157 1131
1158 1132 static void
1159 1133 sysdc_free(void *p)
1160 1134 {
1161 1135 sysdc_t *sdc = p;
1162 1136
1163 1137 if (sdc != NULL) {
1164 1138 /*
1165 1139 * We must have failed CL_ENTERCLASS(), so our pset should be
1166 1140 * there and unused.
1167 1141 */
1168 1142 ASSERT(sdc->sdc_pset != NULL);
1169 1143 ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1170 1144 kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1171 1145 kmem_free(sdc, sizeof (*sdc));
1172 1146 }
1173 1147 }
1174 1148
1175 1149 static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
1176 1150 static int sysdc_einval();
1177 1151 static void sysdc_nullsys();
1178 1152
1179 1153 static struct classfuncs sysdc_classfuncs = {
1180 1154 /* messages to class manager */
1181 1155 {
1182 1156 sysdc_enosys, /* admin */
1183 1157 sysdc_getclinfo,
1184 1158 sysdc_enosys, /* parmsin */
1185 1159 sysdc_enosys, /* parmsout */
1186 1160 sysdc_enosys, /* vaparmsin */
1187 1161 sysdc_enosys, /* vaparmsout */
1188 1162 sysdc_getclpri,
1189 1163 sysdc_alloc,
1190 1164 sysdc_free,
1191 1165 },
1192 1166 /* operations on threads */
1193 1167 {
1194 1168 sysdc_enterclass,
1195 1169 sysdc_exitclass,
1196 1170 sysdc_canexit,
1197 1171 sysdc_fork,
1198 1172 sysdc_forkret,
1199 1173 sysdc_nullsys, /* parmsget */
1200 1174 sysdc_enosys, /* parmsset */
1201 1175 sysdc_nullsys, /* stop */
1202 1176 sysdc_exit,
1203 1177 sysdc_nullsys, /* active */
1204 1178 sysdc_nullsys, /* inactive */
1205 1179 sysdc_no_swap, /* swapin */
1206 1180 sysdc_no_swap, /* swapout */
1207 1181 sysdc_nullsys, /* trapret */
1208 1182 sysdc_preempt,
1209 1183 sysdc_setrun,
1210 1184 sysdc_sleep,
1211 1185 sysdc_tick,
1212 1186 sysdc_wakeup,
1213 1187 sysdc_einval, /* donice */
1214 1188 sysdc_globpri,
1215 1189 sysdc_nullsys, /* set_process_group */
1216 1190 sysdc_nullsys, /* yield */
1217 1191 sysdc_einval, /* doprio */
1218 1192 }
1219 1193 };
1220 1194
1221 1195 static int
1222 1196 sysdc_enosys()
1223 1197 {
1224 1198 return (ENOSYS);
1225 1199 }
1226 1200
1227 1201 static int
1228 1202 sysdc_einval()
1229 1203 {
1230 1204 return (EINVAL);
1231 1205 }
1232 1206
1233 1207 static void
1234 1208 sysdc_nullsys()
1235 1209 {
1236 1210 }
1237 1211
1238 1212 /*ARGSUSED*/
1239 1213 static pri_t
1240 1214 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1241 1215 {
1242 1216 int idx;
1243 1217
1244 1218 list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1245 1219 offsetof(sysdc_pset_t, sdp_node));
1246 1220
1247 1221 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1248 1222 sysdc_active[idx].sdl_list = &sysdc_dummy;
1249 1223 }
1250 1224
1251 1225 sysdc_initparam();
1252 1226
1253 1227 sysdccid = cid;
1254 1228 *clfuncspp = &sysdc_classfuncs;
1255 1229
1256 1230 return ((pri_t)v.v_maxsyspri);
1257 1231 }
1258 1232
1259 1233 static struct sclass csw = {
1260 1234 "SDC",
1261 1235 sysdc_init,
1262 1236 0
1263 1237 };
1264 1238
1265 1239 static struct modlsched modlsched = {
1266 1240 &mod_schedops, "system duty cycle scheduling class", &csw
1267 1241 };
1268 1242
1269 1243 static struct modlinkage modlinkage = {
1270 1244 MODREV_1, (void *)&modlsched, NULL
1271 1245 };
1272 1246
1273 1247 int
1274 1248 _init()
1275 1249 {
1276 1250 return (mod_install(&modlinkage));
1277 1251 }
1278 1252
1279 1253 int
1280 1254 _fini()
1281 1255 {
1282 1256 return (EBUSY); /* can't unload for now */
1283 1257 }
1284 1258
1285 1259 int
1286 1260 _info(struct modinfo *modinfop)
1287 1261 {
1288 1262 return (mod_info(&modlinkage, modinfop));
1289 1263 }
1290 1264
1291 1265 /* --- consolidation-private interfaces --- */
1292 1266 void
1293 1267 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1294 1268 {
1295 1269 void *buf = NULL;
1296 1270 sysdc_params_t sdp;
1297 1271
1298 1272 SYSDC_INC_STAT(sysdc_thread_enter_enter);
1299 1273
1300 1274 ASSERT(sysdc_param_init);
1301 1275 ASSERT(sysdccid >= 0);
1302 1276
1303 1277 ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1304 1278
1305 1279 sdp.sdp_minpri = sysdc_minpri;
1306 1280 sdp.sdp_maxpri = sysdc_maxpri;
1307 1281 sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1308 1282
1309 1283 VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1310 1284
1311 1285 ASSERT(t->t_lwp != NULL);
1312 1286 ASSERT(t->t_cid == syscid);
1313 1287 ASSERT(t->t_cldata == NULL);
1314 1288 VERIFY0(CL_CANEXIT(t, NULL));
1315 1289 VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
1316 1290 CL_EXITCLASS(syscid, NULL);
1317 1291 }
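
For completeness, here is a hypothetical caller of the interface above, loosely modeled on how a taskq-style consumer would adopt SDC. Apart from lwp_kernel_create(), lwptot(), and sysdc_thread_enter() themselves, the names are assumptions; the sketch also assumes it runs in an SSYS system process, and it leaves out making the new thread runnable.

/*
 * Hypothetical consumer (illustration only).  Assumes the calling thread
 * belongs to an SSYS system process, as SDC membership requires.
 */
static kthread_t *
example_sdc_worker_create(void (*worker)(void *), void *arg, uint_t dc)
{
	kthread_t *t;

	/*
	 * The worker needs an LWP (for microstate accounting), so it is
	 * created with lwp_kernel_create() rather than thread_create().
	 */
	t = lwptot(lwp_kernel_create(curproc, worker, arg, TS_STOPPED,
	    minclsyspri));

	/* Move it from the SYS class into SDC with the requested duty cycle. */
	sysdc_thread_enter(t, dc, 0);

	/* (Making the thread runnable is left out of this sketch.) */
	return (t);
}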