3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
--- old/usr/src/uts/common/disp/sysdc.c
+++ new/usr/src/uts/common/disp/sysdc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 + * Copyright (c) 2012 by Delphix. All rights reserved.
27 + */
28 +
29 +/*
26 30 * The System Duty Cycle (SDC) scheduling class
27 31 * --------------------------------------------
28 32 *
29 33 * Background
30 34 *
31 35 * Kernel threads in Solaris have traditionally not been large consumers
32 36 * of CPU time. They typically wake up, perform a small amount of
33 37 * work, then go back to sleep waiting for either a timeout or another
34 38 * signal. On the assumption that the small amount of work that they do
35 39 * is important for the behavior of the whole system, these threads are
36 40 * treated kindly by the dispatcher and the SYS scheduling class: they run
37 41 * without preemption from anything other than real-time and interrupt
38 42 * threads; when preempted, they are put at the front of the queue, so they
39 43 * generally do not migrate between CPUs; and they are allowed to stay
40 44 * running until they voluntarily give up the CPU.
41 45 *
42 46 * As Solaris has evolved, new workloads have emerged which require the
43 47 * kernel to perform significant amounts of CPU-intensive work. One
44 48 * example of such a workload is ZFS's transaction group sync processing.
45 49 * Each sync operation generates a large batch of I/Os, and each I/O
46 50 * may need to be compressed and/or checksummed before it is written to
47 51 * storage. The taskq threads which perform the compression and checksums
48 52 * will run nonstop as long as they have work to do; a large sync operation
49 53 * on a compression-heavy dataset can keep them busy for seconds on end.
50 54 * This causes human-time-scale dispatch latency bubbles for any other
51 55 * threads which have the misfortune to share a CPU with the taskq threads.
52 56 *
53 57 * The SDC scheduling class is a solution to this problem.
54 58 *
55 59 *
56 60 * Overview
57 61 *
58 62 * SDC is centered around the concept of a thread's duty cycle (DC):
59 63 *
60 64 * ONPROC time
61 65 * Duty Cycle = ----------------------
62 66 * ONPROC + Runnable time
63 67 *
64 68 * This is the ratio of the time that the thread spent running on a CPU
65 69 * divided by the time it spent running or trying to run. It is unaffected
66 70 * by any time the thread spent sleeping, stopped, etc.
67 71 *
68 72 * A thread joining the SDC class specifies a "target" DC that it wants
69 73 * to run at. To implement this policy, the routine sysdc_update() scans
70 74 * the list of active SDC threads every few ticks and uses each thread's
71 75 * microstate data to compute the actual duty cycle that that thread
72 76 * has experienced recently. If the thread is under its target DC, its
73 77 * priority is increased to the maximum available (sysdc_maxpri, which is
74 78 * 99 by default). If the thread is over its target DC, its priority is
75 79 * reduced to the minimum available (sysdc_minpri, 0 by default). This
76 80 * is a fairly primitive approach, in that it doesn't use any of the
77 81 * intermediate priorities, but it's not completely inappropriate. Even
78 82 * though threads in the SDC class might take a while to do their job, they
79 83 * are by some definition important if they're running inside the kernel,
80 84 * so it is reasonable that they should get to run at priority 99.
81 85 *
82 86 * If a thread is running when sysdc_update() calculates its actual duty
83 87 * cycle, and there are other threads of equal or greater priority on its
84 88 * CPU's dispatch queue, sysdc_update() preempts that thread. The thread
85 89 * acknowledges the preemption by calling sysdc_preempt(), which calls
86 90 * setbackdq(), which gives other threads with the same priority a chance
87 91 * to run. This creates a de facto time quantum for threads in the SDC
88 92 * scheduling class.
89 93 *
90 94 * An SDC thread which is assigned priority 0 can continue to run if
91 95 * nothing else needs to use the CPU that it's running on. Similarly, an
92 96 * SDC thread at priority 99 might not get to run as much as it wants to
93 97 * if there are other priority-99 or higher threads on its CPU. These
94 98 * situations would cause the thread to get ahead of or behind its target
95 99 * DC; the longer the situations lasted, the further ahead or behind the
96 100 * thread would get. Rather than condemning a thread to a lifetime of
97 101 * paying for its youthful indiscretions, SDC keeps "base" values for
98 102 * ONPROC and Runnable times in each thread's sysdc data, and updates these
99 103 * values periodically. The duty cycle is then computed using the elapsed
100 104 * amount of ONPROC and Runnable times since those base times.
101 105 *
102 106 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
103 107 * keep the list of "active" threads small by pruning out threads which
104 108 * have been asleep for a brief time. They are not pruned immediately upon
105 109 * going to sleep, since some threads may bounce back and forth between
106 110 * sleeping and being runnable.
107 111 *
108 112 *
109 113 * Interfaces
110 114 *
111 115 * void sysdc_thread_enter(t, dc, flags)
112 116 *
113 117 * Moves a kernel thread from the SYS scheduling class to the
114 118 * SDC class. t must have an associated LWP (created by calling
115 119 * lwp_kernel_create()). The thread will have a target DC of dc.
116 120 * Flags should be either 0 or SYSDC_THREAD_BATCH. If
117 121 * SYSDC_THREAD_BATCH is specified, the thread is expected to be
118 122 * doing large amounts of processing.
119 123 *
120 124 *
121 125 * Complications
122 126 *
123 127 * - Run queue balancing
124 128 *
125 129 * The Solaris dispatcher is biased towards letting a thread run
126 130 * on the same CPU which it last ran on, if no more than 3 ticks
127 131 * (i.e. rechoose_interval) have passed since the thread last ran.
128 132 * This helps to preserve cache warmth. On the other hand, it also
129 133 * tries to keep the per-CPU run queues fairly balanced; if the CPU
130 134 * chosen for a runnable thread has a run queue which is three or
131 135 * more threads longer than a neighboring CPU's queue, the runnable
132 136 * thread is dispatched onto the neighboring CPU instead.
133 137 *
134 138 * These policies work well for some workloads, but not for many SDC
135 139 * threads. The taskq client of SDC, for example, has many discrete
136 140 * units of work to do. The work units are largely independent, so
137 141 * cache warmth is not an important consideration. It is important
138 142 * that the threads fan out quickly to different CPUs, since the
139 143 * amount of work these threads have to do (a few seconds worth at a
140 144 * time) doesn't leave much time to correct thread placement errors
141 145 * (i.e. two SDC threads being dispatched to the same CPU).
142 146 *
143 147 * To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
144 148 * This tells the dispatcher to keep neighboring run queues' lengths
145 149 * more evenly matched, which allows SDC threads to migrate more
146 150 * easily.
147 151 *
148 152 * - LWPs and system processes
149 153 *
150 154 * SDC can only be used for kernel threads. Since SDC uses microstate
151 155 * accounting data to compute each thread's actual duty cycle, all
152 156 * threads entering the SDC class must have associated LWPs (which
153 157 * store the microstate data). This means that the threads have to
154 158 * be associated with an SSYS process, i.e. one created by newproc().
155 159 * If the microstate accounting information is ever moved into the
156 160 * kthread_t, this restriction could be lifted.
157 161 *
158 162 * - Dealing with oversubscription
159 163 *
160 164 * Since SDC duty cycles are per-thread, it is possible that the
161 165 * aggregate requested duty cycle of all SDC threads in a processor
162 166 * set could be greater than the total CPU time available in that set.
163 167 * The FSS scheduling class has an analogous situation, which it deals
164 168 * with by reducing each thread's allotted CPU time proportionally.
165 169 * Since SDC doesn't need to be as precise as FSS, it uses a simpler
166 170 * solution to the oversubscription problem.
167 171 *
168 172 * sysdc_update() accumulates the amount of time that max-priority SDC
169 173 * threads have spent on-CPU in each processor set, and uses that sum
170 174 * to create an implied duty cycle for that processor set:
171 175 *
172 176 * accumulated CPU time
173 177 * pset DC = -----------------------------------
174 178 * (# CPUs) * time since last update
175 179 *
176 180 * If this implied duty cycle is above a maximum pset duty cycle (90%
177 181 * by default), sysdc_update() sets the priority of all SDC threads
178 182 * in that processor set to sysdc_minpri for a "break" period. After
179 183 * the break period, it waits for a "nobreak" period before trying to
180 184 * enforce the pset duty cycle limit again.
181 185 *
182 186 * - Processor sets
183 187 *
184 188 * As the above implies, SDC is processor set aware, but it does not
185 189 * currently allow threads to change processor sets while in the SDC
186 190 * class. Instead, those threads must join the desired processor set
187 191 * before entering SDC. [1]
188 192 *
189 193 * - Batch threads
190 194 *
191 195 * A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
192 196 * flag. This flag currently has no effect, but marks threads which
193 197 * do bulk processing.
194 198 *
195 199 * - t_kpri_req
196 200 *
197 201 * The TS and FSS scheduling classes pay attention to t_kpri_req,
198 202 * which provides a simple form of priority inheritance for
199 203 * synchronization primitives (such as rwlocks held as READER) which
200 204 * cannot be traced to a unique thread. The SDC class does not honor
201 205 * t_kpri_req, for a few reasons:
202 206 *
203 207 * 1. t_kpri_req is notoriously inaccurate. A measure of its
204 208 * inaccuracy is that it needs to be cleared every time a thread
205 209 * returns to user mode, because it is frequently non-zero at that
206 210 * point. This can happen because "ownership" of synchronization
207 211 * primitives that use t_kpri_req can be silently handed off,
208 212 * leaving no opportunity to will the t_kpri_req inheritance.
209 213 *
210 214 * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
211 215 * kernel priority. This means that even if an SDC thread
212 216 * is holding a synchronization primitive and running at low
213 217 * priority, its priority will eventually be raised above 60,
214 218 * allowing it to drive on and release the resource.
215 219 *
216 220 * 3. The first consumer of SDC uses the taskq subsystem, which holds
217 221 * a reader lock for the duration of the task's execution. This
218 222 * would mean that SDC threads would never drop below kernel
219 223 * priority in practice, which defeats one of the purposes of SDC.
220 224 *
221 225 * - Why not FSS?
222 226 *
223 227 * It might seem that the existing FSS scheduling class could solve
224 228 * the problems that SDC is attempting to solve. FSS's more precise
225 229 * solution to the oversubscription problem would hardly cause
226 230 * trouble, as long as it performed well. SDC is implemented as
227 231 * a separate scheduling class for two main reasons: the initial
228 232 * consumer of SDC does not map well onto the "project" abstraction
229 233 * that is central to FSS, and FSS does not expect to run at kernel
230 234 * priorities.
231 235 *
232 236 *
233 237 * Tunables
234 238 *
235 239 * - sysdc_update_interval_msec: Number of milliseconds between
236 240 * consecutive thread priority updates.
237 241 *
238 242 * - sysdc_reset_interval_msec: Number of milliseconds between
239 243 * consecutive resets of a thread's base ONPROC and Runnable
240 244 * times.
241 245 *
242 246 * - sysdc_prune_interval_msec: Number of milliseconds of sleeping
243 247 * before a thread is pruned from the active list.
244 248 *
245 249 * - sysdc_max_pset_DC: Allowable percentage of a processor set's
246 250 * CPU time which SDC can give to its high-priority threads.
247 251 *
248 252 * - sysdc_break_msec: Number of milliseconds of "break" taken when
249 253 * sysdc_max_pset_DC is exceeded.
250 254 *
251 255 *
252 256 * Future work (in SDC and related subsystems)
253 257 *
254 258 * - Per-thread rechoose interval (0 for SDC)
255 259 *
256 260 * Allow each thread to specify its own rechoose interval. SDC
257 261 * threads would specify an interval of zero, which would rechoose
258 262 * the CPU with the lowest priority once per update.
259 263 *
260 264 * - Allow threads to change processor sets after joining the SDC class
261 265 *
262 266 * - Thread groups and per-group DC
263 267 *
264 268 * It might be nice to be able to specify a duty cycle which applies
265 269 * to a group of threads in aggregate.
266 270 *
267 271 * - Per-group DC callback to allow dynamic DC tuning
268 272 *
269 273 * Currently, DCs are assigned when the thread joins SDC. Some
270 274 * workloads could benefit from being able to tune their DC using
271 275 * subsystem-specific knowledge about the workload.
272 276 *
273 277 * - Finer-grained priority updates
274 278 *
275 279 * - More nuanced management of oversubscription
276 280 *
277 281 * - Moving other CPU-intensive threads into SDC
278 282 *
279 283 * - Move msacct data into kthread_t
280 284 *
281 285 * This would allow kernel threads without LWPs to join SDC.
282 286 *
283 287 *
284 288 * Footnotes
285 289 *
286 290 * [1] The details of doing so are left as an exercise for the reader.
287 291 */
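
A minimal sketch of the duty-cycle policy described in the block comment above
(illustrative only; dc_pick_pri is a hypothetical helper, not part of this
change): the priority decision is a two-level bang-bang controller on the
elapsed ONPROC and Runnable times since the last base reset.

	/* Illustrative restatement of the policy; not the kernel implementation. */
	static pri_t
	dc_pick_pri(hrtime_t O, hrtime_t R, uint_t target_DC, pri_t prev,
	    pri_t minpri, pri_t maxpri)
	{
		uint_t cur_DC;

		if (O + R == 0)
			return (prev);	/* no new data; keep current priority */
		cur_DC = (uint_t)((O * SYSDC_DC_MAX) / (O + R));
		return (cur_DC < target_DC ? maxpri : minpri);
	}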
288 292
289 293 #include <sys/types.h>
290 294 #include <sys/sysdc.h>
291 295 #include <sys/sysdc_impl.h>
292 296
293 297 #include <sys/class.h>
294 298 #include <sys/cmn_err.h>
295 299 #include <sys/cpuvar.h>
296 300 #include <sys/cpupart.h>
297 301 #include <sys/debug.h>
298 302 #include <sys/disp.h>
299 303 #include <sys/errno.h>
300 304 #include <sys/inline.h>
301 305 #include <sys/kmem.h>
302 306 #include <sys/modctl.h>
303 307 #include <sys/schedctl.h>
304 308 #include <sys/sdt.h>
305 309 #include <sys/sunddi.h>
306 310 #include <sys/sysmacros.h>
307 311 #include <sys/systm.h>
308 312 #include <sys/var.h>
309 313
310 314 /*
311 315 * Tunables - loaded into the internal state at module load time
312 316 */
313 317 uint_t sysdc_update_interval_msec = 20;
314 318 uint_t sysdc_reset_interval_msec = 400;
315 319 uint_t sysdc_prune_interval_msec = 100;
316 320 uint_t sysdc_max_pset_DC = 90;
317 321 uint_t sysdc_break_msec = 80;
318 322
319 323 /*
320 324 * Internal state - constants set up by sysdc_initparam()
321 325 */
322 326 static clock_t sysdc_update_ticks; /* ticks between updates */
323 327 static uint_t sysdc_prune_updates; /* updates asleep before pruning */
324 328 static uint_t sysdc_reset_updates; /* # of updates before reset */
325 329 static uint_t sysdc_break_updates; /* updates to break */
326 330 static uint_t sysdc_nobreak_updates; /* updates to not check */
327 331 static uint_t sysdc_minDC; /* minimum allowed DC */
328 332 static uint_t sysdc_maxDC; /* maximum allowed DC */
329 333 static pri_t sysdc_minpri; /* minimum allowed priority */
330 334 static pri_t sysdc_maxpri; /* maximum allowed priority */
331 335
332 336 /*
333 337 * Internal state
334 338 */
335 339 static kmutex_t sysdc_pset_lock; /* lock protecting pset data */
336 340 static list_t sysdc_psets; /* list of psets with SDC threads */
337 341 static uint_t sysdc_param_init; /* sysdc_initparam() has been called */
338 342 static uint_t sysdc_update_timeout_started; /* update timeout is active */
339 343 static hrtime_t sysdc_last_update; /* time of last sysdc_update() */
340 344 static sysdc_t sysdc_dummy; /* used to terminate active lists */
341 345
342 346 /*
343 347 * Internal state - active hash table
344 348 */
345 349 #define SYSDC_NLISTS 8
346 350 #define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
347 351 static sysdc_list_t sysdc_active[SYSDC_NLISTS];
348 352 #define SYSDC_LIST(sdc) (&sysdc_active[SYSDC_HASH(sdc)])
349 353
350 354 #ifdef DEBUG
351 355 static struct {
352 356 uint64_t sysdc_update_times_asleep;
353 357 uint64_t sysdc_update_times_base_ran_backwards;
354 358 uint64_t sysdc_update_times_already_done;
355 359 uint64_t sysdc_update_times_cur_ran_backwards;
356 360 uint64_t sysdc_compute_pri_breaking;
357 361 uint64_t sysdc_activate_enter;
358 362 uint64_t sysdc_update_enter;
359 363 uint64_t sysdc_update_exited;
360 364 uint64_t sysdc_update_not_sdc;
361 365 uint64_t sysdc_update_idle;
362 366 uint64_t sysdc_update_take_break;
363 367 uint64_t sysdc_update_no_psets;
364 368 uint64_t sysdc_tick_not_sdc;
365 369 uint64_t sysdc_tick_quantum_expired;
366 370 uint64_t sysdc_thread_enter_enter;
367 371 } sysdc_stats;
368 372
369 373 #define SYSDC_INC_STAT(x) (sysdc_stats.x++)
370 374 #else
371 375 #define SYSDC_INC_STAT(x) ((void)0)
372 376 #endif
373 377
374 378 /* macros are UPPER CASE */
375 379 #define HOWMANY(a, b) howmany((a), (b))
376 380 #define MSECTOTICKS(a) HOWMANY((a) * 1000, usec_per_tick)
377 381
378 382 static void
379 383 sysdc_initparam(void)
380 384 {
381 385 uint_t sysdc_break_ticks;
382 386
383 387 /* update / prune intervals */
384 388 sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
385 389
386 390 sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
387 391 sysdc_update_interval_msec);
388 392 sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
389 393 sysdc_update_interval_msec);
390 394
391 395 /* We must get at least a little time on CPU. */
392 396 sysdc_minDC = 1;
393 397 sysdc_maxDC = SYSDC_DC_MAX;
394 398 sysdc_minpri = 0;
395 399 sysdc_maxpri = maxclsyspri;
396 400
397 401 /* break parameters */
398 402 if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
399 403 sysdc_max_pset_DC = SYSDC_DC_MAX;
400 404 }
401 405 sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
402 406 sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
403 407
404 408 /*
405 409 * We want:
406 410 *
407 411 * sysdc_max_pset_DC = (nobreak / (break + nobreak))
408 412 *
409 413 * ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
410 414 *
411 415 * sysdc_max_pset_DC * break
412 416 * ==> nobreak = -------------------------
413 417 * 1 - sysdc_max_pset_DC
414 418 */
415 419 sysdc_nobreak_updates =
416 420 HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
417 421 (SYSDC_DC_MAX - sysdc_max_pset_DC));
418 422
419 423 sysdc_param_init = 1;
420 424 }
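
As a worked example of the derivation above, assuming the default tunables and
a 100 Hz clock (usec_per_tick == 10000): sysdc_update_ticks = MSECTOTICKS(20)
= 2, sysdc_break_ticks = MSECTOTICKS(80) = 8, so sysdc_break_updates =
HOWMANY(8, 2) = 4. With sysdc_max_pset_DC = 90 (and SYSDC_DC_MAX taken as
100), sysdc_nobreak_updates = HOWMANY(4 * 90, 100 - 90) = 36, and the
resulting ratio nobreak / (break + nobreak) = 36 / 40 = 90% matches the
target.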
421 425
422 426 #undef HOWMANY
423 427 #undef MSECTOTICKS
424 428
425 429 #define SDC_UPDATE_INITIAL 0x1 /* for the initial update */
426 430 #define SDC_UPDATE_TIMEOUT 0x2 /* from sysdc_update() */
427 431 #define SDC_UPDATE_TICK 0x4 /* from sysdc_tick(), on expiry */
428 432
429 433 /*
430 434 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
431 435 * and Runnable times since the last reset.
432 436 *
433 437 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
434 438 * to track processor set usage.
435 439 */
436 440 static void
437 441 sysdc_update_times(sysdc_t *sdc, uint_t flags,
438 442 hrtime_t *O, hrtime_t *R, hrtime_t *newO)
439 443 {
440 444 kthread_t *const t = sdc->sdc_thread;
441 445 const uint_t initial = (flags & SDC_UPDATE_INITIAL);
442 446 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
443 447 const clock_t now = ddi_get_lbolt();
444 448 uint_t do_reset;
445 449
446 450 ASSERT(THREAD_LOCK_HELD(t));
447 451
448 452 *O = *R = 0;
449 453
450 454 /* If we've been sleeping, we know we haven't had any ONPROC time. */
451 455 if (sdc->sdc_sleep_updates != 0 &&
452 456 sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
453 457 *newO = sdc->sdc_last_base_O;
454 458 SYSDC_INC_STAT(sysdc_update_times_asleep);
455 459 return;
456 460 }
457 461
458 462 /*
459 463 * If this is our first update, or we've hit the reset point,
460 464 * we need to reset our base_{O,R}. Once we've updated them, we
461 465 * report O and R for the entire prior interval.
462 466 */
463 467 do_reset = initial;
464 468 if (update) {
465 469 ++sdc->sdc_nupdates;
466 470 if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
467 471 do_reset = 1;
468 472 }
469 473 if (do_reset) {
470 474 hrtime_t baseO, baseR;
471 475 if (initial) {
472 476 /*
473 477 * Start off our cycle count somewhere in the middle,
474 478 * to keep the resets from all happening at once.
475 479 *
476 480 * 4999 is a handy prime much larger than
477 481 * sysdc_reset_updates, so that we don't run into
478 482 * trouble if the resolution is a multiple of
479 483 * sysdc_reset_updates.
480 484 */
481 485 sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
482 486 sysdc_reset_updates);
483 487 baseO = baseR = 0;
484 488 } else {
485 489 baseO = sdc->sdc_base_O;
486 490 baseR = sdc->sdc_base_R;
487 491 }
488 492
489 493 mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
490 494 *newO = sdc->sdc_base_O;
491 495
492 496 sdc->sdc_reset = now;
493 497 sdc->sdc_pri_check = -1; /* force mismatch below */
494 498
495 499 /*
496 500 * See below for rationale.
497 501 */
498 502 if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
499 503 SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
500 504 baseO = sdc->sdc_base_O;
501 505 baseR = sdc->sdc_base_R;
502 506 }
503 507
504 508 /* compute based on the entire interval */
505 509 *O = (sdc->sdc_base_O - baseO);
506 510 *R = (sdc->sdc_base_R - baseR);
507 511 return;
508 512 }
509 513
510 514 /*
511 515 * If we're called from sysdc_update(), we *must* return a value
512 516 * for newO, so we always call mstate_systhread_times().
513 517 *
514 518 * Otherwise, if we've already done a pri check this tick,
515 519 * we can skip it.
516 520 */
517 521 if (!update && sdc->sdc_pri_check == now) {
518 522 SYSDC_INC_STAT(sysdc_update_times_already_done);
519 523 return;
520 524 }
521 525
522 526 /* Get the current times from the thread */
523 527 sdc->sdc_pri_check = now;
524 528 mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
525 529 *newO = sdc->sdc_cur_O;
526 530
527 531 /*
528 532 * The updating of microstate accounting is not done under a
529 533 * consistent set of locks, particularly the t_waitrq field. This
530 534 * can lead to narrow windows in which we account for time in the
531 535 * wrong bucket, which on the next read will be accounted for
532 536 * correctly.
533 537 *
534 538 * If our sdc_base_* fields were affected by one of these blips, we
535 539 * throw away the old data, and pretend this tick didn't happen.
536 540 */
537 541 if (sdc->sdc_cur_O < sdc->sdc_base_O ||
538 542 sdc->sdc_cur_R < sdc->sdc_base_R) {
539 543
540 544 sdc->sdc_base_O = sdc->sdc_cur_O;
541 545 sdc->sdc_base_R = sdc->sdc_cur_R;
542 546
543 547 SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
544 548 return;
545 549 }
546 550
547 551 *O = sdc->sdc_cur_O - sdc->sdc_base_O;
548 552 *R = sdc->sdc_cur_R - sdc->sdc_base_R;
549 553 }
550 554
551 555 /*
552 556 * sysdc_compute_pri()
553 557 *
554 558 * Recomputes the priority of the thread, leaving the result in
555 559 * sdc->sdc_epri. Returns 1 if a priority update should occur
556 560 * (which will also trigger a cpu_surrender()), otherwise
557 561 * returns 0.
558 562 */
559 563 static uint_t
560 564 sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
561 565 {
562 566 kthread_t *const t = sdc->sdc_thread;
563 567 const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
564 568 const uint_t tick = (flags & SDC_UPDATE_TICK);
565 569
566 570 hrtime_t O, R;
567 571 hrtime_t newO = -1;
568 572
569 573 ASSERT(THREAD_LOCK_HELD(t));
570 574
571 575 sysdc_update_times(sdc, flags, &O, &R, &newO);
572 576 ASSERT(!update || newO != -1);
573 577
574 578 /* If we have new data, recompute our priority. */
575 579 if ((O + R) != 0) {
576 580 sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
577 581
578 582 /* Adjust our priority to move our DC closer to the target. */
579 583 if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
580 584 sdc->sdc_pri = sdc->sdc_maxpri;
581 585 else
582 586 sdc->sdc_pri = sdc->sdc_minpri;
583 587 }
584 588
585 589 /*
586 590 * If our per-pset duty cycle goes over the max, we will take a break.
587 591 * This forces all sysdc threads in the pset to minimum priority, in
588 592 * order to let everyone else have a chance at the CPU.
589 593 */
590 594 if (sdc->sdc_pset->sdp_need_break) {
591 595 SYSDC_INC_STAT(sysdc_compute_pri_breaking);
592 596 sdc->sdc_epri = sdc->sdc_minpri;
593 597 } else {
594 598 sdc->sdc_epri = sdc->sdc_pri;
595 599 }
596 600
597 601 DTRACE_PROBE4(sysdc__compute__pri,
598 602 kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
599 603 uint_t, sdc->sdc_target_DC);
600 604
601 605 /*
602 606 * For sysdc_update(), we compute the ONPROC time for high-priority
603 607 * threads, which is used to calculate the per-pset duty cycle. We
604 608 * will always tell our callers to update the thread's priority,
605 609 * since we want to force a cpu_surrender().
606 610 *
607 611 * We reset sdc_update_ticks so that sysdc_tick() will only update
608 612 * the thread's priority if our timeout is delayed by a tick or
609 613 * more.
610 614 */
611 615 if (update) {
612 616 /* SDC threads are not allowed to change cpupart bindings. */
613 617 ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
614 618
615 619 /* If we were at MAXPRI, account for our onproc time. */
616 620 if (t->t_pri == sdc->sdc_maxpri &&
617 621 sdc->sdc_last_base_O != 0 &&
618 622 sdc->sdc_last_base_O < newO) {
619 623 sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
620 624 sdc->sdc_pset->sdp_onproc_time +=
621 625 (uint64_t)sdc->sdc_last_O;
622 626 sdc->sdc_pset->sdp_onproc_threads++;
623 627 } else {
624 628 sdc->sdc_last_O = 0;
625 629 }
626 630 sdc->sdc_last_base_O = newO;
627 631
628 632 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
629 633 return (1);
630 634 }
631 635
632 636 /*
633 637 * Like sysdc_update(), sysdc_tick() always wants to update the
634 638 * thread's priority, so that the CPU is surrendered if necessary.
635 639 * We reset sdc_update_ticks so that if the timeout continues to be
636 640 * delayed, we'll update at the regular interval.
637 641 */
638 642 if (tick) {
639 643 ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
640 644 sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
641 645 return (1);
642 646 }
643 647
644 648 /*
645 649 * Otherwise, only tell our callers to update the priority if it has
646 650 * changed.
647 651 */
648 652 return (sdc->sdc_epri != t->t_pri);
649 653 }
650 654
651 655 static void
652 656 sysdc_update_pri(sysdc_t *sdc, uint_t flags)
653 657 {
654 658 kthread_t *t = sdc->sdc_thread;
655 659
656 660 ASSERT(THREAD_LOCK_HELD(t));
657 661
658 662 if (sysdc_compute_pri(sdc, flags)) {
659 663 if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
660 664 cpu_surrender(t);
661 665 }
662 666 }
663 667 }
664 668
665 669 /*
666 670 * Add a thread onto the active list. It will only be removed by
667 671 * sysdc_update().
668 672 */
669 673 static void
670 674 sysdc_activate(sysdc_t *sdc)
671 675 {
672 676 sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
673 677 sysdc_t *head;
674 678 kthread_t *t = sdc->sdc_thread;
675 679
676 680 SYSDC_INC_STAT(sysdc_activate_enter);
677 681
678 682 ASSERT(sdc->sdc_next == NULL);
679 683 ASSERT(THREAD_LOCK_HELD(t));
680 684
681 685 do {
682 686 head = *headp;
683 687 sdc->sdc_next = head;
684 688 } while (atomic_cas_ptr(headp, head, sdc) != head);
685 689 }
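
The compare-and-swap push above pairs with the atomic_swap_ptr() drain in
sysdc_update(); a generic sketch of the pattern, with a hypothetical node type
(the real code uses sysdc_t and terminates lists with &sysdc_dummy):

	typedef struct node { struct node *next; } node_t;

	/* producer: push a node onto a LIFO list without taking a lock */
	static void
	lifo_push(node_t *volatile *headp, node_t *n)
	{
		node_t *head;

		do {
			head = *headp;
			n->next = head;
		} while (atomic_cas_ptr(headp, head, n) != head);
	}

	/* consumer: detach the whole list at once, leaving a sentinel behind */
	static node_t *
	lifo_take_all(node_t *volatile *headp, node_t *sentinel)
	{
		return (atomic_swap_ptr(headp, sentinel));
	}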
686 690
687 691 /*
688 692 * sysdc_update() has two jobs:
689 693 *
690 694 * 1. It updates the priorities of all active SDC threads on the system.
691 695 * 2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
692 696 */
693 697 static void
694 698 sysdc_update(void *arg)
695 699 {
696 700 int idx;
697 701 sysdc_t *freelist = NULL;
698 702 sysdc_pset_t *cur;
699 703 hrtime_t now, diff;
700 704 uint_t redeploy = 1;
701 705
702 706 SYSDC_INC_STAT(sysdc_update_enter);
703 707
704 708 ASSERT(sysdc_update_timeout_started);
705 709
706 710 /*
707 711 * If this is our first time through, diff will be gigantic, and
708 712 * no breaks will be necessary.
709 713 */
710 714 now = gethrtime();
711 715 diff = now - sysdc_last_update;
712 716 sysdc_last_update = now;
713 717
714 718 mutex_enter(&sysdc_pset_lock);
715 719 for (cur = list_head(&sysdc_psets); cur != NULL;
716 720 cur = list_next(&sysdc_psets, cur)) {
717 721 boolean_t breaking = (cur->sdp_should_break != 0);
718 722
719 723 if (cur->sdp_need_break != breaking) {
720 724 DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
721 725 boolean_t, breaking);
722 726 }
723 727 cur->sdp_onproc_time = 0;
724 728 cur->sdp_onproc_threads = 0;
725 729 cur->sdp_need_break = breaking;
726 730 }
727 731 mutex_exit(&sysdc_pset_lock);
728 732
729 733 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
730 734 sysdc_list_t *sdl = &sysdc_active[idx];
731 735 sysdc_t *volatile *headp = &sdl->sdl_list;
732 736 sysdc_t *head, *tail;
733 737 sysdc_t **prevptr;
734 738
735 739 if (*headp == &sysdc_dummy)
736 740 continue;
737 741
738 742 /* Prevent any threads from exiting while we're poking them. */
739 743 mutex_enter(&sdl->sdl_lock);
740 744
741 745 /*
742 746 * Each sdl_list contains a singly-linked list of active
743 747 * threads. Threads which become active while we are
744 748 * processing the list will be added to sdl_list. Since we
745 749 * don't want that to interfere with our own processing, we
746 750 * swap in an empty list. Any newly active threads will
747 751 * go on to this empty list. When finished, we'll put any
748 752 * such threads at the end of the processed list.
749 753 */
750 754 head = atomic_swap_ptr(headp, &sysdc_dummy);
751 755 prevptr = &head;
752 756 while (*prevptr != &sysdc_dummy) {
753 757 sysdc_t *const sdc = *prevptr;
754 758 kthread_t *const t = sdc->sdc_thread;
755 759
756 760 /*
757 761 * If the thread has exited, move its sysdc_t onto
758 762 * freelist, to be freed later.
759 763 */
760 764 if (t == NULL) {
761 765 *prevptr = sdc->sdc_next;
762 766 SYSDC_INC_STAT(sysdc_update_exited);
763 767 sdc->sdc_next = freelist;
764 768 freelist = sdc;
765 769 continue;
766 770 }
767 771
768 772 thread_lock(t);
769 773 if (t->t_cid != sysdccid) {
770 774 thread_unlock(t);
771 775 prevptr = &sdc->sdc_next;
772 776 SYSDC_INC_STAT(sysdc_update_not_sdc);
773 777 continue;
774 778 }
775 779 ASSERT(t->t_cldata == sdc);
776 780
777 781 /*
778 782 * If the thread has been sleeping for longer
779 783 * than sysdc_prune_interval, make it inactive by
780 784 * removing it from the list.
781 785 */
782 786 if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
783 787 sdc->sdc_sleep_updates != 0 &&
784 788 (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
785 789 sysdc_prune_updates) {
786 790 *prevptr = sdc->sdc_next;
787 791 SYSDC_INC_STAT(sysdc_update_idle);
788 792 sdc->sdc_next = NULL;
789 793 thread_unlock(t);
790 794 continue;
791 795 }
792 796 sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
793 797 thread_unlock(t);
794 798
795 799 prevptr = &sdc->sdc_next;
796 800 }
797 801
798 802 /*
799 803 * Add our list to the bucket, putting any new entries
800 804 * added while we were working at the tail of the list.
801 805 */
802 806 do {
803 807 tail = *headp;
804 808 *prevptr = tail;
805 809 } while (atomic_cas_ptr(headp, tail, head) != tail);
806 810
807 811 mutex_exit(&sdl->sdl_lock);
808 812 }
809 813
810 814 mutex_enter(&sysdc_pset_lock);
811 815 for (cur = list_head(&sysdc_psets); cur != NULL;
812 816 cur = list_next(&sysdc_psets, cur)) {
813 817
814 818 cur->sdp_vtime_last_interval =
815 819 diff * cur->sdp_cpupart->cp_ncpus;
816 820 cur->sdp_DC_last_interval =
817 821 (cur->sdp_onproc_time * SYSDC_DC_MAX) /
818 822 cur->sdp_vtime_last_interval;
819 823
820 824 if (cur->sdp_should_break > 0) {
821 825 cur->sdp_should_break--; /* breaking */
822 826 continue;
823 827 }
824 828 if (cur->sdp_dont_break > 0) {
825 829 cur->sdp_dont_break--; /* waiting before checking */
826 830 continue;
827 831 }
828 832 if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
829 833 cur->sdp_should_break = sysdc_break_updates;
830 834 cur->sdp_dont_break = sysdc_nobreak_updates;
831 835 SYSDC_INC_STAT(sysdc_update_take_break);
832 836 }
833 837 }
834 838
835 839 /*
836 840 * If there are no sysdc_psets, there can be no threads, so
837 841 * we can stop doing our timeout. Since we're holding the
838 842 * sysdc_pset_lock, no new sysdc_psets can come in, which will
839 843 * prevent anyone from racing with this and dropping our timeout
840 844 * on the floor.
841 845 */
842 846 if (list_is_empty(&sysdc_psets)) {
843 847 SYSDC_INC_STAT(sysdc_update_no_psets);
844 848 ASSERT(sysdc_update_timeout_started);
845 849 sysdc_update_timeout_started = 0;
846 850
847 851 redeploy = 0;
848 852 }
849 853 mutex_exit(&sysdc_pset_lock);
850 854
851 855 while (freelist != NULL) {
852 856 sysdc_t *cur = freelist;
853 857 freelist = cur->sdc_next;
854 858 kmem_free(cur, sizeof (*cur));
855 859 }
856 860
857 861 if (redeploy) {
858 862 (void) timeout(sysdc_update, arg, sysdc_update_ticks);
859 863 }
860 864 }
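
A worked example of the per-pset check above: with the default 20 ms update
interval on a 4-CPU pset, sdp_vtime_last_interval is roughly 80 ms of virtual
CPU time. If the pset's max-priority SDC threads accumulated 76 ms of ONPROC
time in that window, sdp_DC_last_interval is 76/80 = 95%, which exceeds
sysdc_max_pset_DC (90); the pset then breaks for sysdc_break_updates updates
and, afterwards, skips the check for sysdc_nobreak_updates updates.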
861 865
862 866 static void
863 867 sysdc_preempt(kthread_t *t)
864 868 {
865 869 ASSERT(t == curthread);
866 870 ASSERT(THREAD_LOCK_HELD(t));
867 871
868 872 setbackdq(t); /* give others a chance to run */
869 873 }
870 874
871 875 static void
872 876 sysdc_tick(kthread_t *t)
873 877 {
874 878 sysdc_t *sdc;
875 879
876 880 thread_lock(t);
877 881 if (t->t_cid != sysdccid) {
878 882 SYSDC_INC_STAT(sysdc_tick_not_sdc);
879 883 thread_unlock(t);
880 884 return;
881 885 }
882 886 sdc = t->t_cldata;
883 887 if (t->t_state == TS_ONPROC &&
884 888 t->t_pri < t->t_disp_queue->disp_maxrunpri) {
885 889 cpu_surrender(t);
886 890 }
887 891
888 892 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
889 893 ASSERT(sdc->sdc_sleep_updates == 0);
890 894 }
891 895
892 896 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
893 897 sdc->sdc_ticks++;
894 898 if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
895 899 SYSDC_INC_STAT(sysdc_tick_quantum_expired);
896 900 sysdc_update_pri(sdc, SDC_UPDATE_TICK);
897 901 ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
898 902 }
899 903 thread_unlock(t);
900 904 }
901 905
902 906 static void
903 907 sysdc_setrun(kthread_t *t)
904 908 {
905 909 sysdc_t *sdc = t->t_cldata;
906 910
907 911 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
908 912
909 913 sdc->sdc_sleep_updates = 0;
910 914
911 915 if (sdc->sdc_next == NULL) {
912 916 /*
913 917 * Since we're in transition, we don't want to use the
914 918 * full thread_update_pri().
915 919 */
916 920 if (sysdc_compute_pri(sdc, 0)) {
917 921 THREAD_CHANGE_PRI(t, sdc->sdc_epri);
918 922 }
919 923 sysdc_activate(sdc);
920 924
921 925 ASSERT(sdc->sdc_next != NULL);
922 926 }
923 927
924 928 setbackdq(t);
925 929 }
926 930
927 931 static void
928 932 sysdc_wakeup(kthread_t *t)
929 933 {
930 934 sysdc_setrun(t);
931 935 }
932 936
933 937 static void
934 938 sysdc_sleep(kthread_t *t)
935 939 {
936 940 sysdc_t *sdc = t->t_cldata;
937 941
938 942 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */
939 943
940 944 sdc->sdc_sleep_updates = sdc->sdc_nupdates;
941 945 }
942 946
943 947 /*ARGSUSED*/
944 948 static int
945 949 sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
946 950 void *bufp)
947 951 {
948 952 cpupart_t *const cpupart = t->t_cpupart;
949 953 sysdc_t *sdc = bufp;
950 954 sysdc_params_t *sdpp = parmsp;
951 955 sysdc_pset_t *newpset = sdc->sdc_pset;
952 956 sysdc_pset_t *pset;
953 957 int start_timeout;
954 958
955 959 if (t->t_cid != syscid)
956 960 return (EPERM);
957 961
958 962 ASSERT(ttolwp(t) != NULL);
959 963 ASSERT(sdpp != NULL);
960 964 ASSERT(newpset != NULL);
961 965 ASSERT(sysdc_param_init);
962 966
963 967 ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
964 968 ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
965 969 ASSERT(sdpp->sdp_DC >= sysdc_minDC);
966 970 ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
967 971
968 972 sdc->sdc_thread = t;
969 973 sdc->sdc_pri = sdpp->sdp_maxpri; /* start off maximally */
970 974 sdc->sdc_minpri = sdpp->sdp_minpri;
971 975 sdc->sdc_maxpri = sdpp->sdp_maxpri;
972 976 sdc->sdc_target_DC = sdpp->sdp_DC;
973 977 sdc->sdc_ticks = 0;
974 978 sdc->sdc_update_ticks = sysdc_update_ticks + 1;
975 979
976 980 /* Assign ourselves to the appropriate pset. */
977 981 sdc->sdc_pset = NULL;
978 982 mutex_enter(&sysdc_pset_lock);
979 983 for (pset = list_head(&sysdc_psets); pset != NULL;
980 984 pset = list_next(&sysdc_psets, pset)) {
981 985 if (pset->sdp_cpupart == cpupart) {
982 986 break;
983 987 }
984 988 }
985 989 if (pset == NULL) {
986 990 pset = newpset;
987 991 newpset = NULL;
988 992 pset->sdp_cpupart = cpupart;
989 993 list_insert_tail(&sysdc_psets, pset);
990 994 }
991 995 pset->sdp_nthreads++;
992 996 ASSERT(pset->sdp_nthreads > 0);
993 997
994 998 sdc->sdc_pset = pset;
995 999
996 1000 start_timeout = (sysdc_update_timeout_started == 0);
997 1001 sysdc_update_timeout_started = 1;
998 1002 mutex_exit(&sysdc_pset_lock);
999 1003
1000 1004 if (newpset != NULL)
1001 1005 kmem_free(newpset, sizeof (*newpset));
1002 1006
1003 1007 /* Update t's scheduling class and priority. */
1004 1008 thread_lock(t);
1005 1009 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1006 1010 t->t_cid = cid;
1007 1011 t->t_cldata = sdc;
1008 1012 t->t_schedflag |= TS_RUNQMATCH;
1009 1013
1010 1014 sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
1011 1015 thread_unlock(t);
1012 1016
1013 1017 /* Kick off the thread timeout if we're the first one in. */
1014 1018 if (start_timeout) {
1015 1019 (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
1016 1020 }
1017 1021
1018 1022 return (0);
1019 1023 }
1020 1024
1021 1025 static void
1022 1026 sysdc_leave(sysdc_t *sdc)
1023 1027 {
1024 1028 sysdc_pset_t *sdp = sdc->sdc_pset;
1025 1029 sysdc_list_t *sdl = SYSDC_LIST(sdc);
1026 1030 uint_t freedc;
1027 1031
1028 1032 mutex_enter(&sdl->sdl_lock); /* block sysdc_update() */
1029 1033 sdc->sdc_thread = NULL;
1030 1034 freedc = (sdc->sdc_next == NULL);
1031 1035 mutex_exit(&sdl->sdl_lock);
1032 1036
1033 1037 mutex_enter(&sysdc_pset_lock);
1034 1038 ASSERT(sdp != NULL);
1035 1039 ASSERT(sdp->sdp_nthreads > 0);
1036 1040 --sdp->sdp_nthreads;
1037 1041 if (sdp->sdp_nthreads == 0) {
1038 1042 list_remove(&sysdc_psets, sdp);
1039 1043 } else {
1040 1044 sdp = NULL;
1041 1045 }
1042 1046 mutex_exit(&sysdc_pset_lock);
1043 1047
1044 1048 if (freedc)
1045 1049 kmem_free(sdc, sizeof (*sdc));
1046 1050 if (sdp != NULL)
1047 1051 kmem_free(sdp, sizeof (*sdp));
1048 1052 }
1049 1053
1050 1054 static void
1051 1055 sysdc_exitclass(void *buf)
1052 1056 {
1053 1057 sysdc_leave((sysdc_t *)buf);
1054 1058 }
1055 1059
1056 1060 /*ARGSUSED*/
1057 1061 static int
1058 1062 sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
1059 1063 {
1060 1064 /* Threads cannot exit SDC once joined, except in a body bag. */
1061 1065 return (EPERM);
1062 1066 }
1063 1067
1064 1068 static void
1065 1069 sysdc_exit(kthread_t *t)
1066 1070 {
1067 1071 sysdc_t *sdc;
1068 1072
1069 1073 /* We're exiting, so we just rejoin the SYS class. */
1070 1074 thread_lock(t);
1071 1075 ASSERT(t->t_cid == sysdccid);
1072 1076 sdc = t->t_cldata;
1073 1077 t->t_cid = syscid;
1074 1078 t->t_cldata = NULL;
1075 1079 t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
1076 1080 (void) thread_change_pri(t, maxclsyspri, 0);
1077 1081 t->t_schedflag &= ~TS_RUNQMATCH;
1078 1082 thread_unlock_nopreempt(t);
1079 1083
1080 1084 /* Unlink the sdc from everything. */
1081 1085 sysdc_leave(sdc);
1082 1086 }
1083 1087
1084 1088 /*ARGSUSED*/
1085 1089 static int
1086 1090 sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
1087 1091 {
1088 1092 /*
1089 1093 * Threads cannot be created with SDC as their class; they must
1090 1094 * be created as SYS and then added with sysdc_thread_enter().
1091 1095 * Because of this restriction, sysdc_fork() should never be called.
1092 1096 */
1093 1097 panic("sysdc cannot be forked");
1094 1098
1095 1099 return (ENOSYS);
1096 1100 }
1097 1101
1098 1102 /*ARGSUSED*/
1099 1103 static void
1100 1104 sysdc_forkret(kthread_t *t, kthread_t *ct)
1101 1105 {
1102 1106 /* SDC threads are part of system processes, which never fork. */
1103 1107 panic("sysdc cannot be forked");
1104 1108 }
1105 1109
1106 1110 static pri_t
1107 1111 sysdc_globpri(kthread_t *t)
1108 1112 {
1109 1113 return (t->t_epri);
1110 1114 }
1111 1115
1112 1116 /*ARGSUSED*/
1113 1117 static pri_t
1114 1118 sysdc_no_swap(kthread_t *t, int flags)
1115 1119 {
1116 1120 /* SDC threads cannot be swapped. */
1117 1121 return (-1);
1118 1122 }
1119 1123
1120 1124 /*
1121 1125 * Get maximum and minimum priorities enjoyed by SDC threads.
1122 1126 */
1123 1127 static int
1124 1128 sysdc_getclpri(pcpri_t *pcprip)
1125 1129 {
1126 1130 pcprip->pc_clpmax = sysdc_maxpri;
1127 1131 pcprip->pc_clpmin = sysdc_minpri;
1128 1132 return (0);
1129 1133 }
1130 1134
1131 1135 /*ARGSUSED*/
1132 1136 static int
1133 1137 sysdc_getclinfo(void *arg)
1134 1138 {
1135 1139 return (0); /* no class-specific info */
1136 1140 }
1137 1141
1138 1142 /*ARGSUSED*/
1139 1143 static int
1140 1144 sysdc_alloc(void **p, int flag)
1141 1145 {
1142 1146 sysdc_t *new;
1143 1147
1144 1148 *p = NULL;
1145 1149 if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
1146 1150 return (ENOMEM);
1147 1151 }
1148 1152 if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
1149 1153 NULL) {
1150 1154 kmem_free(new, sizeof (*new));
1151 1155 return (ENOMEM);
1152 1156 }
1153 1157 *p = new;
1154 1158 return (0);
1155 1159 }
1156 1160
1157 1161 static void
1158 1162 sysdc_free(void *p)
1159 1163 {
1160 1164 sysdc_t *sdc = p;
1161 1165
1162 1166 if (sdc != NULL) {
1163 1167 /*
1164 1168 * We must have failed CL_ENTERCLASS(), so our pset should be
1165 1169 * there and unused.
1166 1170 */
1167 1171 ASSERT(sdc->sdc_pset != NULL);
1168 1172 ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
1169 1173 kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
1170 1174 kmem_free(sdc, sizeof (*sdc));
1171 1175 }
1172 1176 }
1173 1177
1174 1178 static int sysdc_enosys(); /* Boy, ANSI-C's K&R compatibility is weird. */
1175 1179 static int sysdc_einval();
1176 1180 static void sysdc_nullsys();
1177 1181
1178 1182 static struct classfuncs sysdc_classfuncs = {
1179 1183 /* messages to class manager */
1180 1184 {
1181 1185 sysdc_enosys, /* admin */
1182 1186 sysdc_getclinfo,
1183 1187 sysdc_enosys, /* parmsin */
1184 1188 sysdc_enosys, /* parmsout */
1185 1189 sysdc_enosys, /* vaparmsin */
1186 1190 sysdc_enosys, /* vaparmsout */
1187 1191 sysdc_getclpri,
1188 1192 sysdc_alloc,
1189 1193 sysdc_free,
1190 1194 },
1191 1195 /* operations on threads */
1192 1196 {
1193 1197 sysdc_enterclass,
1194 1198 sysdc_exitclass,
1195 1199 sysdc_canexit,
1196 1200 sysdc_fork,
1197 1201 sysdc_forkret,
1198 1202 sysdc_nullsys, /* parmsget */
1199 1203 sysdc_enosys, /* parmsset */
1200 1204 sysdc_nullsys, /* stop */
1201 1205 sysdc_exit,
1202 1206 sysdc_nullsys, /* active */
1203 1207 sysdc_nullsys, /* inactive */
1204 1208 sysdc_no_swap, /* swapin */
1205 1209 sysdc_no_swap, /* swapout */
1206 1210 sysdc_nullsys, /* trapret */
1207 1211 sysdc_preempt,
1208 1212 sysdc_setrun,
1209 1213 sysdc_sleep,
1210 1214 sysdc_tick,
1211 1215 sysdc_wakeup,
1212 1216 sysdc_einval, /* donice */
1213 1217 sysdc_globpri,
1214 1218 sysdc_nullsys, /* set_process_group */
1215 1219 sysdc_nullsys, /* yield */
1216 1220 sysdc_einval, /* doprio */
1217 1221 }
1218 1222 };
1219 1223
1220 1224 static int
1221 1225 sysdc_enosys()
1222 1226 {
1223 1227 return (ENOSYS);
1224 1228 }
1225 1229
1226 1230 static int
1227 1231 sysdc_einval()
1228 1232 {
1229 1233 return (EINVAL);
1230 1234 }
1231 1235
1232 1236 static void
1233 1237 sysdc_nullsys()
1234 1238 {
1235 1239 }
1236 1240
1237 1241 /*ARGSUSED*/
1238 1242 static pri_t
1239 1243 sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
1240 1244 {
1241 1245 int idx;
1242 1246
1243 1247 list_create(&sysdc_psets, sizeof (sysdc_pset_t),
1244 1248 offsetof(sysdc_pset_t, sdp_node));
1245 1249
1246 1250 for (idx = 0; idx < SYSDC_NLISTS; idx++) {
1247 1251 sysdc_active[idx].sdl_list = &sysdc_dummy;
1248 1252 }
1249 1253
1250 1254 sysdc_initparam();
1251 1255
1252 1256 sysdccid = cid;
1253 1257 *clfuncspp = &sysdc_classfuncs;
1254 1258
1255 1259 return ((pri_t)v.v_maxsyspri);
1256 1260 }
1257 1261
1258 1262 static struct sclass csw = {
1259 1263 "SDC",
1260 1264 sysdc_init,
1261 1265 0
1262 1266 };
1263 1267
1264 1268 static struct modlsched modlsched = {
1265 1269 &mod_schedops, "system duty cycle scheduling class", &csw
1266 1270 };
1267 1271
1268 1272 static struct modlinkage modlinkage = {
1269 1273 MODREV_1, (void *)&modlsched, NULL
1270 1274 };
1271 1275
1272 1276 int
1273 1277 _init()
1274 1278 {
1275 1279 return (mod_install(&modlinkage));
1276 1280 }
1277 1281
1278 1282 int
1279 1283 _fini()
1280 1284 {
1281 1285 return (EBUSY); /* can't unload for now */
1282 1286 }
1283 1287
1284 1288 int
1285 1289 _info(struct modinfo *modinfop)
1286 1290 {
1287 1291 return (mod_info(&modlinkage, modinfop));
1288 1292 }
1289 1293
1290 1294 /* --- consolidation-private interfaces --- */
1291 1295 void
1292 1296 sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
1293 1297 {
1294 1298 void *buf = NULL;
1295 1299 sysdc_params_t sdp;
1296 1300
1297 1301 SYSDC_INC_STAT(sysdc_thread_enter_enter);
1298 1302
1299 1303 ASSERT(sysdc_param_init);
1300 1304 ASSERT(sysdccid >= 0);
1301 1305
1302 1306 ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
1303 1307
1304 1308 sdp.sdp_minpri = sysdc_minpri;
1305 1309 sdp.sdp_maxpri = sysdc_maxpri;
1306 1310 sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
1307 1311
1308 - VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
1312 + VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));
1309 1313
1310 1314 ASSERT(t->t_lwp != NULL);
1311 1315 ASSERT(t->t_cid == syscid);
1312 1316 ASSERT(t->t_cldata == NULL);
1313 - VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
1314 - VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
1317 + VERIFY0(CL_CANEXIT(t, NULL));
1318 + VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
1315 1319 CL_EXITCLASS(syscid, NULL);
1316 1320 }
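
A hedged usage sketch of sysdc_thread_enter() (example_worker is hypothetical;
the real consumer is the taskq subsystem): a kernel thread that belongs to an
SSYS process and has an LWP can move itself into SDC before starting its
CPU-intensive loop.

	/*
	 * Hypothetical worker entry point: enter SDC with an 85% target duty
	 * cycle, then do bulk work. The thread must have been created with an
	 * LWP (e.g. via lwp_kernel_create()), per the Interfaces notes above.
	 */
	static void
	example_worker(void *arg)
	{
		sysdc_thread_enter(curthread, 85, 0);

		for (;;) {
			/* ... perform CPU-intensive work ... */
		}
	}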