illumos-gate Wdiff usr/src/uts/common/os/msacct.c

Print this page

9936 atomic ops in syscall_mstate() induce significant overhead
9942 zone secflags are not initialized correctly

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/msacct.c
          +++ new/usr/src/uts/common/os/msacct.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24      - * Copyright 2012 Joyent, Inc.  All rights reserved.
       24 + * Copyright (c) 2018, Joyent, Inc.
  25   25   */
  26   26  
  27   27  #include <sys/types.h>
  28   28  #include <sys/param.h>
  29   29  #include <sys/systm.h>
  30   30  #include <sys/user.h>
  31   31  #include <sys/proc.h>
  32   32  #include <sys/cpuvar.h>
  33   33  #include <sys/thread.h>
  34   34  #include <sys/debug.h>

  35   35  #include <sys/msacct.h>
  36   36  #include <sys/time.h>
  37   37  #include <sys/zone.h>
  38   38  
  39   39  /*
  40   40   * Mega-theory block comment:
  41   41   *
  42   42   * Microstate accounting uses finite states and the transitions between these
  43   43   * states to measure timing and accounting information.  The state information
  44   44   * is presently tracked for threads (via microstate accounting) and cpus (via
  45   45   * cpu microstate accounting).  In each case, these accounting mechanisms use
  46   46   * states and transitions to measure time spent in each state instead of
  47   47   * clock-based sampling methodologies.
  48   48   *
  49   49   * For microstate accounting:
  50   50   * state transitions are accomplished by calling new_mstate() to switch between
  51   51   * states.  Transitions from a sleeping state (LMS_SLEEP and LMS_STOPPED) occur
  52   52   * by calling restore_mstate() which restores a thread to its previously running
  53   53   * state.  This code is primarily executed by the dispatcher in disp() before
  54   54   * running a process that was put to sleep.  If the thread was not in a sleeping
  55   55   * state, this call has little effect other than to update the count of time the
  56   56   * thread has spent waiting on run-queues in its lifetime.
  57   57   *
  58   58   * For cpu microstate accounting:
  59   59   * Cpu microstate accounting is similar to the microstate accounting for threads
  60   60   * but it tracks user, system, and idle time for cpus.  Cpu microstate
  61   61   * accounting does not track interrupt times as there is a pre-existing
  62   62   * interrupt accounting mechanism for this purpose.  Cpu microstate accounting
  63   63   * tracks time that user threads have spent active, idle, or in the system on a
  64   64   * given cpu.  Cpu microstate accounting has fewer states which allows it to
  65   65   * have better defined transitions.  The states transition in the following
  66   66   * order:
  67   67   *
  68   68   *  CMS_USER <-> CMS_SYSTEM <-> CMS_IDLE
  69   69   *
  70   70   * In order to get to the idle state, the cpu microstate must first go through
  71   71   * the system state, and vice-versa for the user state from idle.  The switching
  72   72   * of the microstates from user to system is done as part of the regular thread
  73   73   * microstate accounting code, except for the idle state which is switched by
  74   74   * the dispatcher before it runs the idle loop.
  75   75   *
  76   76   * Cpu percentages:
  77   77   * Cpu percentages are now handled by and based upon microstate accounting
  78   78   * information (the same is true for load averages).  The routines which handle
  79   79   * the growing/shrinking and exponentiation of cpu percentages have been moved
  80   80   * here as it now makes more sense for them to be generated from the microstate
  81   81   * code.  Cpu percentages are generated similarly to the way they were before;
  82   82   * however, now they are based upon high-resolution timestamps and the
  83   83   * timestamps are modified at various state changes instead of during a clock()
  84   84   * interrupt.  This allows us to generate more accurate cpu percentages which
  85   85   * are also in-sync with microstate data.
  86   86   */
  87   87  
  88   88  /*
  89   89   * Initialize the microstate level and the
  90   90   * associated accounting information for an LWP.
  91   91   */
  92   92  void
  93   93  init_mstate(
  94   94          kthread_t       *t,
  95   95          int             init_state)
  96   96  {
  97   97          struct mstate *ms;
  98   98          klwp_t *lwp;
  99   99          hrtime_t curtime;
 100  100  
 101  101          ASSERT(init_state != LMS_WAIT_CPU);
 102  102          ASSERT((unsigned)init_state < NMSTATES);
 103  103  
 104  104          if ((lwp = ttolwp(t)) != NULL) {
 105  105                  ms = &lwp->lwp_mstate;
 106  106                  curtime = gethrtime_unscaled();
 107  107                  ms->ms_prev = LMS_SYSTEM;
 108  108                  ms->ms_start = curtime;
 109  109                  ms->ms_term = 0;
 110  110                  ms->ms_state_start = curtime;
 111  111                  t->t_mstate = init_state;
 112  112                  t->t_waitrq = 0;
 113  113                  t->t_hrtime = curtime;
 114  114                  if ((t->t_proc_flag & TP_MSACCT) == 0)
 115  115                          t->t_proc_flag |= TP_MSACCT;
 116  116                  bzero((caddr_t)&ms->ms_acct[0], sizeof (ms->ms_acct));
 117  117          }
 118  118  }
 119  119  
 120  120  /*
 121  121   * Initialize the microstate level and associated accounting information
 122  122   * for the specified cpu
 123  123   */
 124  124  
 125  125  void
 126  126  init_cpu_mstate(
 127  127          cpu_t *cpu,
 128  128          int init_state)
 129  129  {
 130  130          ASSERT(init_state != CMS_DISABLED);
 131  131  
 132  132          cpu->cpu_mstate = init_state;
 133  133          cpu->cpu_mstate_start = gethrtime_unscaled();
 134  134          cpu->cpu_waitrq = 0;
 135  135          bzero((caddr_t)&cpu->cpu_acct[0], sizeof (cpu->cpu_acct));
 136  136  }
 137  137  
 138  138  /*
 139  139   * Sets cpu state to CMS_DISABLED (offline).  We don't actually track this time,
 140  140   * but it serves as a useful placeholder state for when we're not
 141  141   * doing anything.
 142  142   */
 143  143  
 144  144  void
 145  145  term_cpu_mstate(struct cpu *cpu)
 146  146  {
 147  147          ASSERT(cpu->cpu_mstate != CMS_DISABLED);
 148  148          cpu->cpu_mstate = CMS_DISABLED;
 149  149          cpu->cpu_mstate_start = 0;
 150  150  }
 151  151  
 152  152  /* NEW_CPU_MSTATE comments inline in new_cpu_mstate below. */
 153  153  
 154  154  #define NEW_CPU_MSTATE(state)                                           \
 155  155          gen = cpu->cpu_mstate_gen;                                      \
 156  156          cpu->cpu_mstate_gen = 0;                                        \
 157  157          /* Need membar_producer() here if stores not ordered / TSO */   \
 158  158          cpu->cpu_acct[cpu->cpu_mstate] += curtime - cpu->cpu_mstate_start; \
 159  159          cpu->cpu_mstate = state;                                        \
 160  160          cpu->cpu_mstate_start = curtime;                                \
 161  161          /* Need membar_producer() here if stores not ordered / TSO */   \
 162  162          cpu->cpu_mstate_gen = (++gen == 0) ? 1 : gen;
 163  163  
 164  164  void
 165  165  new_cpu_mstate(int cmstate, hrtime_t curtime)
 166  166  {
 167  167          cpu_t *cpu = CPU;
 168  168          uint16_t gen;
 169  169  
 170  170          ASSERT(cpu->cpu_mstate != CMS_DISABLED);
 171  171          ASSERT(cmstate < NCMSTATES);
 172  172          ASSERT(cmstate != CMS_DISABLED);
 173  173  
 174  174          /*
 175  175           * This function cannot be re-entrant on a given CPU. As such,
 176  176           * we ASSERT and panic if we are called on behalf of an interrupt.
 177  177           * The one exception is for an interrupt which has previously
 178  178           * blocked. Such an interrupt is being scheduled by the dispatcher
 179  179           * just like a normal thread, and as such cannot arrive here
 180  180           * in a re-entrant manner.
 181  181           */
 182  182  
 183  183          ASSERT(!CPU_ON_INTR(cpu) && curthread->t_intr == NULL);
 184  184          ASSERT(curthread->t_preempt > 0 || curthread == cpu->cpu_idle_thread);
 185  185  
 186  186          /*
 187  187           * LOCKING, or lack thereof:
 188  188           *
 189  189           * Updates to CPU mstate can only be made by the CPU
 190  190           * itself, and the above check to ignore interrupts
 191  191           * should prevent recursion into this function on a given
 192  192           * processor. i.e. no possible write contention.
 193  193           *
 194  194           * However, reads of CPU mstate can occur at any time
 195  195           * from any CPU. Any locking added to this code path
 196  196           * would seriously impact syscall performance. So,
 197  197           * instead we have a best-effort protection for readers.
 198  198           * The reader will want to account for any time between
 199  199           * cpu_mstate_start and the present time. This requires
 200  200           * some guarantees that the reader is getting coherent
 201  201           * information.
 202  202           *
 203  203           * We use a generation counter, which is set to 0 before
 204  204           * we start making changes, and is set to a new value
 205  205           * after we're done. Someone reading the CPU mstate
 206  206           * should check for the same non-zero value of this
 207  207           * counter both before and after reading all state. The
 208  208           * important point is that the reader is not a
 209  209           * performance-critical path, but this function is.
 210  210           *
 211  211           * The ordering of writes is critical. cpu_mstate_gen must
 212  212           * be visibly zero on all CPUs before we change cpu_mstate
 213  213           * and cpu_mstate_start. Additionally, cpu_mstate_gen must
 214  214           * not be restored to oldgen+1 until after all of the other
 215  215           * writes have become visible.
 216  216           *
 217  217           * Normally one puts membar_producer() calls to accomplish
 218  218           * this. Unfortunately this routine is extremely performance
 219  219           * critical (esp. in syscall_mstate below) and we cannot
 220  220           * afford the additional time, particularly on some x86
 221  221           * architectures with extremely slow sfence calls. On a
 222  222           * CPU which guarantees write ordering (including sparc, x86,
 223  223           * and amd64) this is not a problem. The compiler could still
 224  224           * reorder the writes, so we make the four cpu fields
 225  225           * volatile to prevent this.
 226  226           *
 227  227           * TSO warning: should we port to a non-TSO (or equivalent)
 228  228           * CPU, this will break.
 229  229           *
 230  230           * The reader still needs the membar_consumer() calls because,
 231  231           * although the volatiles prevent the compiler from reordering
 232  232           * loads, the CPU can still do so.
 233  233           */
 234  234  
 235  235          NEW_CPU_MSTATE(cmstate);
 236  236  }
 237  237  
 238  238  /*
 239  239   * Return an aggregation of user and system CPU time consumed by
 240  240   * the specified thread in scaled nanoseconds.
 241  241   */
 242  242  hrtime_t
 243  243  mstate_thread_onproc_time(kthread_t *t)
 244  244  {
 245  245          hrtime_t aggr_time;
 246  246          hrtime_t now;
 247  247          hrtime_t waitrq;
 248  248          hrtime_t state_start;
 249  249          struct mstate *ms;
 250  250          klwp_t *lwp;
 251  251          int     mstate;
 252  252  
 253  253          ASSERT(THREAD_LOCK_HELD(t));
 254  254  
 255  255          if ((lwp = ttolwp(t)) == NULL)
 256  256                  return (0);
 257  257  
 258  258          mstate = t->t_mstate;
 259  259          waitrq = t->t_waitrq;
 260  260          ms = &lwp->lwp_mstate;
 261  261          state_start = ms->ms_state_start;
 262  262  
 263  263          aggr_time = ms->ms_acct[LMS_USER] +
 264  264              ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
 265  265  
 266  266          now = gethrtime_unscaled();
 267  267  
 268  268          /*
 269  269           * NOTE: gethrtime_unscaled on X86 taken on different CPUs is
 270  270           * inconsistent, so it is possible that now < state_start.
 271  271           */
 272  272          if (mstate == LMS_USER || mstate == LMS_SYSTEM || mstate == LMS_TRAP) {
 273  273                  /* if waitrq is zero, count all of the time. */
 274  274                  if (waitrq == 0) {
 275  275                          waitrq = now;
 276  276                  }
 277  277  
 278  278                  if (waitrq > state_start) {
 279  279                          aggr_time += waitrq - state_start;
 280  280                  }
 281  281          }
 282  282  
 283  283          scalehrtime(&aggr_time);
 284  284          return (aggr_time);
 285  285  }
 286  286  
 287  287  /*
 288  288   * Return the amount of onproc and runnable time this thread has experienced.
 289  289   *
 290  290   * Because the fields we read are not protected by locks when updated
 291  291   * by the thread itself, this is an inherently racy interface.  In
 292  292   * particular, the ASSERT(THREAD_LOCK_HELD(t)) doesn't guarantee as much
 293  293   * as it might appear to.
 294  294   *
 295  295   * The implication for users of this interface is that onproc and runnable
 296  296   * are *NOT* monotonically increasing; they may temporarily be larger than
 297  297   * they should be.
 298  298   */
 299  299  void
 300  300  mstate_systhread_times(kthread_t *t, hrtime_t *onproc, hrtime_t *runnable)
 301  301  {
 302  302          struct mstate   *const  ms = &ttolwp(t)->lwp_mstate;
 303  303  
 304  304          int             mstate;
 305  305          hrtime_t        now;
 306  306          hrtime_t        state_start;
 307  307          hrtime_t        waitrq;
 308  308          hrtime_t        aggr_onp;
 309  309          hrtime_t        aggr_run;
 310  310  
 311  311          ASSERT(THREAD_LOCK_HELD(t));
 312  312          ASSERT(t->t_procp->p_flag & SSYS);
 313  313          ASSERT(ttolwp(t) != NULL);
 314  314  
 315  315          /* shouldn't be any non-SYSTEM on-CPU time */
 316  316          ASSERT(ms->ms_acct[LMS_USER] == 0);
 317  317          ASSERT(ms->ms_acct[LMS_TRAP] == 0);
 318  318  
 319  319          mstate = t->t_mstate;
 320  320          waitrq = t->t_waitrq;
 321  321          state_start = ms->ms_state_start;
 322  322  
 323  323          aggr_onp = ms->ms_acct[LMS_SYSTEM];
 324  324          aggr_run = ms->ms_acct[LMS_WAIT_CPU];
 325  325  
 326  326          now = gethrtime_unscaled();
 327  327  
 328  328          /* if waitrq == 0, then there is no time to account to TS_RUN */
 329  329          if (waitrq == 0)
 330  330                  waitrq = now;
 331  331  
 332  332          /* If there is system time to accumulate, do so */
 333  333          if (mstate == LMS_SYSTEM && state_start < waitrq)
 334  334                  aggr_onp += waitrq - state_start;
 335  335  
 336  336          if (waitrq < now)
 337  337                  aggr_run += now - waitrq;
 338  338  
 339  339          scalehrtime(&aggr_onp);
 340  340          scalehrtime(&aggr_run);
 341  341  
 342  342          *onproc = aggr_onp;
 343  343          *runnable = aggr_run;
 344  344  }
 345  345  
 346  346  /*
 347  347   * Return an aggregation of microstate times in scaled nanoseconds (high-res
 348  348   * time).  This keeps in mind that p_acct is already scaled, and ms_acct is
 349  349   * not.
 350  350   */
 351  351  hrtime_t
 352  352  mstate_aggr_state(proc_t *p, int a_state)
 353  353  {
 354  354          struct mstate *ms;
 355  355          kthread_t *t;
 356  356          klwp_t *lwp;
 357  357          hrtime_t aggr_time;
 358  358          hrtime_t scaledtime;
 359  359  
 360  360          ASSERT(MUTEX_HELD(&p->p_lock));
 361  361          ASSERT((unsigned)a_state < NMSTATES);
 362  362  
 363  363          aggr_time = p->p_acct[a_state];
 364  364          if (a_state == LMS_SYSTEM)
 365  365                  aggr_time += p->p_acct[LMS_TRAP];
 366  366  
 367  367          t = p->p_tlist;
 368  368          if (t == NULL)
 369  369                  return (aggr_time);
 370  370  
 371  371          do {
 372  372                  if (t->t_proc_flag & TP_LWPEXIT)
 373  373                          continue;
 374  374  
 375  375                  lwp = ttolwp(t);
 376  376                  ms = &lwp->lwp_mstate;
 377  377                  scaledtime = ms->ms_acct[a_state];
 378  378                  scalehrtime(&scaledtime);
 379  379                  aggr_time += scaledtime;
 380  380                  if (a_state == LMS_SYSTEM) {
 381  381                          scaledtime = ms->ms_acct[LMS_TRAP];
 382  382                          scalehrtime(&scaledtime);
 383  383                          aggr_time += scaledtime;
 384  384                  }
 385  385          } while ((t = t->t_forw) != p->p_tlist);
 386  386  
 387  387          return (aggr_time);
 388  388  }
 389  389  
 390  390  
 391  391  void
 392  392  syscall_mstate(int fromms, int toms)
 393  393  {
 394  394          kthread_t *t = curthread;
 395  395          zone_t *z = ttozone(t);
 396  396          struct mstate *ms;
 397  397          hrtime_t *mstimep;
 398  398          hrtime_t curtime;
 399  399          klwp_t *lwp;
 400  400          hrtime_t newtime;
 401  401          cpu_t *cpu;
 402  402          uint16_t gen;
 403  403  
 404  404          if ((lwp = ttolwp(t)) == NULL)
 405  405                  return;
 406  406  
 407  407          ASSERT(fromms < NMSTATES);
 408  408          ASSERT(toms < NMSTATES);

↓ open down ↓

374 lines elided

↑ open up ↑

 409  409  
 410  410          ms = &lwp->lwp_mstate;
 411  411          mstimep = &ms->ms_acct[fromms];
 412  412          curtime = gethrtime_unscaled();
 413  413          newtime = curtime - ms->ms_state_start;
 414  414          while (newtime < 0) {
 415  415                  curtime = gethrtime_unscaled();
 416  416                  newtime = curtime - ms->ms_state_start;
 417  417          }
 418  418          *mstimep += newtime;
 419      -        if (fromms == LMS_USER)
 420      -                atomic_add_64(&z->zone_utime, newtime);
 421      -        else if (fromms == LMS_SYSTEM)
 422      -                atomic_add_64(&z->zone_stime, newtime);
 423  419          t->t_mstate = toms;
 424  420          ms->ms_state_start = curtime;
 425  421          ms->ms_prev = fromms;
 426  422          kpreempt_disable(); /* don't change CPU while changing CPU's state */
 427  423          cpu = CPU;
 428  424          ASSERT(cpu == t->t_cpu);
      425 +
      426 +        if (fromms == LMS_USER) {
      427 +                CPU_UARRAY_VAL(z->zone_ustate, cpu->cpu_id,
      428 +                    ZONE_USTATE_UTIME) += newtime;
      429 +        } else if (fromms == LMS_SYSTEM) {
      430 +                CPU_UARRAY_VAL(z->zone_ustate, cpu->cpu_id,
      431 +                    ZONE_USTATE_STIME) += newtime;
      432 +        }
      433 +
 429  434          if ((toms != LMS_USER) && (cpu->cpu_mstate != CMS_SYSTEM)) {
 430  435                  NEW_CPU_MSTATE(CMS_SYSTEM);
 431  436          } else if ((toms == LMS_USER) && (cpu->cpu_mstate != CMS_USER)) {
 432  437                  NEW_CPU_MSTATE(CMS_USER);
 433  438          }
 434  439          kpreempt_enable();
 435  440  }
 436  441  
 437  442  #undef NEW_CPU_MSTATE
 438  443

 439  444  /*
 440  445   * The following is for computing the percentage of cpu time used recently
 441  446   * by an lwp.  The function cpu_decay() is also called from /proc code.
 442  447   *
 443  448   * exp_x(x):
 444  449   * Given x as a 64-bit non-negative scaled integer of arbitrary magnitude,
 445  450   * Return exp(-x) as a 64-bit scaled integer in the range [0 .. 1].
 446  451   *
 447  452   * Scaling for 64-bit scaled integer:
 448  453   * The binary point is to the right of the high-order bit
 449  454   * of the low-order 32-bit word.
 450  455   */
 451  456  
 452  457  #define LSHIFT  31
 453  458  #define LSI_ONE ((uint32_t)1 << LSHIFT) /* 32-bit scaled integer 1 */
 454  459  
 455  460  #ifdef DEBUG
 456  461  uint_t expx_cnt = 0;    /* number of calls to exp_x() */
 457  462  uint_t expx_mul = 0;    /* number of long multiplies in exp_x() */
 458  463  #endif
 459  464  
 460  465  static uint64_t
 461  466  exp_x(uint64_t x)
 462  467  {
 463  468          int i;
 464  469          uint64_t ull;
 465  470          uint32_t ui;
 466  471  
 467  472  #ifdef DEBUG
 468  473          expx_cnt++;
 469  474  #endif
 470  475          /*
 471  476           * By the formula:
 472  477           *      exp(-x) = exp(-x/2) * exp(-x/2)
 473  478           * we keep halving x until it becomes small enough for
 474  479           * the following approximation to be accurate enough:
 475  480           *      exp(-x) = 1 - x
 476  481           * We reduce x until it is less than 1/4 (the 2 in LSHIFT-2 below).
 477  482           * Our final error will be smaller than 4%.
 478  483           */
 479  484  
 480  485          /*
 481  486           * Use a uint64_t for the initial shift calculation.
 482  487           */
 483  488          ull = x >> (LSHIFT-2);
 484  489  
 485  490          /*
 486  491           * Short circuit:
 487  492           * A number this large produces effectively 0 (actually .005).
 488  493           * This way, we will never do more than 5 multiplies.
 489  494           */
 490  495          if (ull >= (1 << 5))
 491  496                  return (0);
 492  497  
 493  498          ui = ull;       /* OK.  Now we can use a uint_t. */
 494  499          for (i = 0; ui != 0; i++)
 495  500                  ui >>= 1;
 496  501  
 497  502          if (i != 0) {
 498  503  #ifdef DEBUG
 499  504                  expx_mul += i;  /* seldom happens */
 500  505  #endif
 501  506                  x >>= i;
 502  507          }
 503  508  
 504  509          /*
 505  510           * Now we compute 1 - x and square it the number of times
 506  511           * that we halved x above to produce the final result:
 507  512           */
 508  513          x = LSI_ONE - x;
 509  514          while (i--)
 510  515                  x = (x * x) >> LSHIFT;
 511  516  
 512  517          return (x);
 513  518  }
 514  519  
 515  520  /*
 516  521   * Given the old percent cpu and a time delta in nanoseconds,
 517  522   * return the new decayed percent cpu:  pct * exp(-tau),
 518  523   * where 'tau' is the time delta multiplied by a decay factor.
 519  524   * We have chosen the decay factor (cpu_decay_factor in param.c)
 520  525   * to make the decay over five seconds be approximately 20%.
 521  526   *
 522  527   * 'pct' is a 32-bit scaled integer <= 1
 523  528   * The binary point is to the right of the high-order bit
 524  529   * of the 32-bit word.
 525  530   */
 526  531  static uint32_t
 527  532  cpu_decay(uint32_t pct, hrtime_t nsec)
 528  533  {
 529  534          uint64_t delta = (uint64_t)nsec;
 530  535  
 531  536          delta /= cpu_decay_factor;
 532  537          return ((pct * exp_x(delta)) >> LSHIFT);
 533  538  }
 534  539  
 535  540  /*
 536  541   * Given the old percent cpu and a time delta in nanoseconds,
 537  542   * return the new grown percent cpu:  1 - ( 1 - pct ) * exp(-tau)
 538  543   */
 539  544  static uint32_t
 540  545  cpu_grow(uint32_t pct, hrtime_t nsec)
 541  546  {
 542  547          return (LSI_ONE - cpu_decay(LSI_ONE - pct, nsec));
 543  548  }
 544  549  
 545  550  
 546  551  /*
 547  552   * Defined to determine whether a lwp is still on a processor.
 548  553   */
 549  554  
 550  555  #define T_ONPROC(kt)    \
 551  556          ((kt)->t_mstate < LMS_SLEEP)
 552  557  #define T_OFFPROC(kt)   \
 553  558          ((kt)->t_mstate >= LMS_SLEEP)
 554  559  
 555  560  uint_t
 556  561  cpu_update_pct(kthread_t *t, hrtime_t newtime)
 557  562  {
 558  563          hrtime_t delta;
 559  564          hrtime_t hrlb;
 560  565          uint_t pctcpu;
 561  566          uint_t npctcpu;
 562  567  
 563  568          /*
 564  569           * This routine can get called at PIL > 0, so this *has* to be
 565  570           * done atomically. Holding locks here causes bad things to happen.
 566  571           * (read: deadlock).
 567  572           */
 568  573  
 569  574          do {
 570  575                  pctcpu = t->t_pctcpu;
 571  576                  hrlb = t->t_hrtime;
 572  577                  delta = newtime - hrlb;
 573  578                  if (delta < 0) {
 574  579                          newtime = gethrtime_unscaled();
 575  580                          delta = newtime - hrlb;
 576  581                  }
 577  582                  t->t_hrtime = newtime;
 578  583                  scalehrtime(&delta);
 579  584                  if (T_ONPROC(t) && t->t_waitrq == 0) {
 580  585                          npctcpu = cpu_grow(pctcpu, delta);
 581  586                  } else {
 582  587                          npctcpu = cpu_decay(pctcpu, delta);
 583  588                  }
 584  589          } while (atomic_cas_32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu);
 585  590  
 586  591          return (npctcpu);
 587  592  }
 588  593  
 589  594  /*
 590  595   * Change the microstate level for the LWP and update the
 591  596   * associated accounting information.  Return the previous
 592  597   * LWP state.
 593  598   */
 594  599  int
 595  600  new_mstate(kthread_t *t, int new_state)
 596  601  {
 597  602          struct mstate *ms;
 598  603          unsigned state;
 599  604          hrtime_t *mstimep;
 600  605          hrtime_t curtime;
 601  606          hrtime_t newtime;
 602  607          hrtime_t oldtime;
 603  608          hrtime_t ztime;
 604  609          hrtime_t origstart;
 605  610          klwp_t *lwp;
 606  611          zone_t *z;
 607  612  
 608  613          ASSERT(new_state != LMS_WAIT_CPU);
 609  614          ASSERT((unsigned)new_state < NMSTATES);
 610  615          ASSERT(t == curthread || THREAD_LOCK_HELD(t));
 611  616  
 612  617          /*
 613  618           * Don't do microstate processing for threads without a lwp (kernel
 614  619           * threads).  Also, if we're an interrupt thread that is pinning another
 615  620           * thread, our t_mstate hasn't been initialized.  We'd be modifying the
 616  621           * microstate of the underlying lwp which doesn't realize that it's
 617  622           * pinned.  In this case, also don't change the microstate.
 618  623           */
 619  624          if (((lwp = ttolwp(t)) == NULL) || t->t_intr)
 620  625                  return (LMS_SYSTEM);
 621  626  
 622  627          curtime = gethrtime_unscaled();
 623  628  
 624  629          /* adjust cpu percentages before we go any further */
 625  630          (void) cpu_update_pct(t, curtime);
 626  631  
 627  632          ms = &lwp->lwp_mstate;
 628  633          state = t->t_mstate;
 629  634          origstart = ms->ms_state_start;
 630  635          do {
 631  636                  switch (state) {
 632  637                  case LMS_TFAULT:
 633  638                  case LMS_DFAULT:
 634  639                  case LMS_KFAULT:
 635  640                  case LMS_USER_LOCK:
 636  641                          mstimep = &ms->ms_acct[LMS_SYSTEM];
 637  642                          break;
 638  643                  default:
 639  644                          mstimep = &ms->ms_acct[state];
 640  645                          break;
 641  646                  }
 642  647                  ztime = newtime = curtime - ms->ms_state_start;
 643  648                  if (newtime < 0) {
 644  649                          curtime = gethrtime_unscaled();
 645  650                          oldtime = *mstimep - 1; /* force CAS to fail */

↓ open down ↓

207 lines elided

↑ open up ↑

 646  651                          continue;
 647  652                  }
 648  653                  oldtime = *mstimep;
 649  654                  newtime += oldtime;
 650  655                  t->t_mstate = new_state;
 651  656                  ms->ms_state_start = curtime;
 652  657          } while (atomic_cas_64((uint64_t *)mstimep, oldtime, newtime) !=
 653  658              oldtime);
 654  659  
 655  660          /*
 656      -         * When the system boots the initial startup thread will have a
 657      -         * ms_state_start of 0 which would add a huge system time to the global
 658      -         * zone.  We want to skip aggregating that initial bit of work.
 659      -         */
 660      -        if (origstart != 0) {
 661      -                z = ttozone(t);
 662      -                if (state == LMS_USER)
 663      -                        atomic_add_64(&z->zone_utime, ztime);
 664      -                else if (state == LMS_SYSTEM)
 665      -                        atomic_add_64(&z->zone_stime, ztime);
 666      -        }
 667      -
 668      -        /*
 669  661           * Remember the previous running microstate.
 670  662           */
 671  663          if (state != LMS_SLEEP && state != LMS_STOPPED)
 672  664                  ms->ms_prev = state;
 673  665  
 674  666          /*
 675  667           * Switch CPU microstate if appropriate
 676  668           */
 677  669  
 678  670          kpreempt_disable(); /* MUST disable kpreempt before touching t->cpu */
      671 +
 679  672          ASSERT(t->t_cpu == CPU);
      673 +
      674 +        /*
      675 +         * When the system boots the initial startup thread will have a
      676 +         * ms_state_start of 0 which would add a huge system time to the global
      677 +         * zone.  We want to skip aggregating that initial bit of work.
      678 +         */
      679 +        if (origstart != 0) {
      680 +                z = ttozone(t);
      681 +                if (state == LMS_USER) {
      682 +                        CPU_UARRAY_VAL(z->zone_ustate, t->t_cpu->cpu_id,
      683 +                            ZONE_USTATE_UTIME) += ztime;
      684 +                } else if (state == LMS_SYSTEM) {
      685 +                        CPU_UARRAY_VAL(z->zone_ustate, t->t_cpu->cpu_id,
      686 +                            ZONE_USTATE_STIME) += ztime;
      687 +                }
      688 +        }
      689 +
 680  690          if (!CPU_ON_INTR(t->t_cpu) && curthread->t_intr == NULL) {
 681  691                  if (new_state == LMS_USER && t->t_cpu->cpu_mstate != CMS_USER)
 682  692                          new_cpu_mstate(CMS_USER, curtime);
 683  693                  else if (new_state != LMS_USER &&
 684  694                      t->t_cpu->cpu_mstate != CMS_SYSTEM)
 685  695                          new_cpu_mstate(CMS_SYSTEM, curtime);
 686  696          }
 687  697          kpreempt_enable();
 688  698  
 689  699          return (ms->ms_prev);

 690  700  }
 691  701  
 692  702  /*
 693  703   * Restore the LWP microstate to the previous runnable state.
 694  704   * Called from disp() with the newly selected lwp.
            *
            * Threads without an lwp are ignored.  Otherwise the interval from
            * ms_state_start to t_waitrq (or to the current time when t_waitrq is 0)
            * is added to the ms_acct[] slot selected from t_mstate, the interval from
            * that point to the current time is added to LMS_WAIT_CPU, and
            * ms_state_start is reset to the current time.  A thread found in
            * LMS_SLEEP or LMS_STOPPED has t_mstate set back to ms_prev.
 695  705   */
 696  706  void
 697  707  restore_mstate(kthread_t *t)
 698  708  {
 699  709          struct mstate *ms;
 700  710          hrtime_t *mstimep;
 701  711          klwp_t *lwp;
 702  712          hrtime_t curtime;
 703  713          hrtime_t waitrq;
 704  714          hrtime_t newtime;
 705  715          hrtime_t oldtime;
 706  716          hrtime_t waittime;
 707  717          zone_t *z;
 708  718  
 709  719          /*
 710  720           * Don't call restore mstate of threads without lwps.  (Kernel threads)
 711  721           *
 712  722           * threads with t_intr set shouldn't be in the dispatcher, so assert
 713  723           * that nobody here has t_intr.
 714  724           */
 715  725          ASSERT(t->t_intr == NULL);
 716  726  
 717  727          if ((lwp = ttolwp(t)) == NULL)
 718  728                  return;
 719  729  
 720  730          curtime = gethrtime_unscaled();
 721  731          (void) cpu_update_pct(t, curtime);
 722  732          ms = &lwp->lwp_mstate;
 723  733          ASSERT((unsigned)t->t_mstate < NMSTATES);
 724  734          do {    /* repeated until the atomic_cas_64() at the bottom succeeds */
 725  735                  switch (t->t_mstate) {
 726  736                  case LMS_SLEEP:
 727  737                          /*
 728  738                           * Update the timer for the current sleep state.
 729  739                           */
 730  740                          ASSERT((unsigned)ms->ms_prev < NMSTATES);
 731  741                          switch (ms->ms_prev) {
 732  742                          case LMS_TFAULT:
 733  743                          case LMS_DFAULT:
 734  744                          case LMS_KFAULT:
 735  745                          case LMS_USER_LOCK:
 736  746                                  mstimep = &ms->ms_acct[ms->ms_prev];
 737  747                                  break;
 738  748                          default:
 739  749                                  mstimep = &ms->ms_acct[LMS_SLEEP];
 740  750                                  break;
 741  751                          }
 742  752                          /*
 743  753                           * Return to the previous run state.
 744  754                           */
 745  755                          t->t_mstate = ms->ms_prev;
 746  756                          break;
 747  757                  case LMS_STOPPED:
 748  758                          mstimep = &ms->ms_acct[LMS_STOPPED];
 749  759                          /*
 750  760                           * Return to the previous run state.
 751  761                           */
 752  762                          t->t_mstate = ms->ms_prev;
 753  763                          break;
 754  764                  case LMS_TFAULT:
 755  765                  case LMS_DFAULT:
 756  766                  case LMS_KFAULT:
 757  767                  case LMS_USER_LOCK:
 758  768                          mstimep = &ms->ms_acct[LMS_SYSTEM];     /* charged to the LMS_SYSTEM slot */
 759  769                          break;
 760  770                  default:
 761  771                          mstimep = &ms->ms_acct[t->t_mstate];
 762  772                          break;
 763  773                  }
 764  774                  waitrq = t->t_waitrq;   /* hopefully atomic */
 765  775                  if (waitrq == 0) {
 766  776                          waitrq = curtime;
 767  777                  }
 768  778                  t->t_waitrq = 0;
 769  779                  newtime = waitrq - ms->ms_state_start;
 770  780                  if (newtime < 0) {      /* negative interval: re-read the clock and go around again */
 771  781                          curtime = gethrtime_unscaled();
 772  782                          oldtime = *mstimep - 1; /* force CAS to fail */
 773  783                          continue;
 774  784                  }
 775  785                  oldtime = *mstimep;

↓ open down ↓

86 lines elided

↑ open up ↑

 776  786                  newtime += oldtime;     /* new total for the selected slot */
 777  787          } while (atomic_cas_64((uint64_t *)mstimep, oldtime, newtime) !=
 778  788              oldtime);
 779  789  
 780  790          /*
 781  791           * Update the WAIT_CPU timer and per-cpu waitrq total.
 782  792           */
 783  793          z = ttozone(t);
 784  794          waittime = curtime - waitrq;
 785  795          ms->ms_acct[LMS_WAIT_CPU] += waittime;
 786      -        atomic_add_64(&z->zone_wtime, waittime);
      796 +
      797 +        /*
      798 +         * We are in a disp context where we're not going to migrate CPUs.
      799 +         */
      800 +        CPU_UARRAY_VAL(z->zone_ustate, CPU->cpu_id,
      801 +            ZONE_USTATE_WTIME) += waittime;
      802 +
 787  803          CPU->cpu_waitrq += waittime;
 788  804          ms->ms_state_start = curtime;   /* the restored state is timed from here */
 789  805  }
 790  806  
 791  807  /*
 792  808   * Copy lwp microstate accounting and resource usage information
 793  809   * to the process.  (lwp is terminating)
            *
            * The caller must hold p->p_lock.  The thread is first switched to
            * LMS_STOPPED, and the resulting ms_state_start is recorded as ms_term.
            * The elapsed time from ms_start to ms_term and every ms_acct[] total
            * are passed through scalehrtime() and added to p_mlreal and p_acct[].
            * The lwp_ru counters are added to p_ru and p_defunct is incremented.
 794  810   */
 795  811  void
 796  812  term_mstate(kthread_t *t)

 797  813  {
 798  814          struct mstate *ms;
 799  815          proc_t *p = ttoproc(t);
 800  816          klwp_t *lwp = ttolwp(t);
 801  817          int i;
 802  818          hrtime_t tmp;
 803  819  
 804  820          ASSERT(MUTEX_HELD(&p->p_lock));
 805  821  
 806  822          ms = &lwp->lwp_mstate;
 807  823          (void) new_mstate(t, LMS_STOPPED);
 808  824          ms->ms_term = ms->ms_state_start;      /* start of the stopped state is the termination time */
 809  825          tmp = ms->ms_term - ms->ms_start;       /* elapsed time from ms_start to ms_term */
 810  826          scalehrtime(&tmp);
 811  827          p->p_mlreal += tmp;
 812  828          for (i = 0; i < NMSTATES; i++) {
 813  829                  tmp = ms->ms_acct[i];   /* scale a copy; the lwp's own total is left as is */
 814  830                  scalehrtime(&tmp);
 815  831                  p->p_acct[i] += tmp;
 816  832          }
 817  833          p->p_ru.minflt   += lwp->lwp_ru.minflt;
 818  834          p->p_ru.majflt   += lwp->lwp_ru.majflt;
 819  835          p->p_ru.nswap    += lwp->lwp_ru.nswap;
 820  836          p->p_ru.inblock  += lwp->lwp_ru.inblock;
 821  837          p->p_ru.oublock  += lwp->lwp_ru.oublock;
 822  838          p->p_ru.msgsnd   += lwp->lwp_ru.msgsnd;
 823  839          p->p_ru.msgrcv   += lwp->lwp_ru.msgrcv;
 824  840          p->p_ru.nsignals += lwp->lwp_ru.nsignals;
 825  841          p->p_ru.nvcsw    += lwp->lwp_ru.nvcsw;
 826  842          p->p_ru.nivcsw   += lwp->lwp_ru.nivcsw;
 827  843          p->p_ru.sysc     += lwp->lwp_ru.sysc;
 828  844          p->p_ru.ioch     += lwp->lwp_ru.ioch;
 829  845          p->p_defunct++;
 830  846  }

↓ open down ↓

34 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX