illumos-gate Wdiff usr/src/uts/common/disp/disp.c

Print this page

OS-7125 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/disp/disp.c
          +++ new/usr/src/uts/common/disp/disp.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the

↓ open down ↓

15 lines elided

↑ open up ↑

  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
       26 +/*
       27 + * Copyright (c) 2018, Joyent, Inc. All rights reserved.
       28 + */
       29 +
  26   30  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27   31  /*        All Rights Reserved   */
  28   32  
  29   33  
  30   34  #include <sys/types.h>
  31   35  #include <sys/param.h>
  32   36  #include <sys/sysmacros.h>
  33   37  #include <sys/signal.h>
  34   38  #include <sys/user.h>
  35   39  #include <sys/systm.h>

  36   40  #include <sys/sysinfo.h>
  37   41  #include <sys/var.h>
  38   42  #include <sys/errno.h>
  39   43  #include <sys/cmn_err.h>
  40   44  #include <sys/debug.h>
  41   45  #include <sys/inline.h>
  42   46  #include <sys/disp.h>
  43   47  #include <sys/class.h>
  44   48  #include <sys/bitmap.h>
  45   49  #include <sys/kmem.h>
  46   50  #include <sys/cpuvar.h>
  47   51  #include <sys/vtrace.h>
  48   52  #include <sys/tnf.h>

↓ open down ↓

13 lines elided

↑ open up ↑

  49   53  #include <sys/cpupart.h>
  50   54  #include <sys/lgrp.h>
  51   55  #include <sys/pg.h>
  52   56  #include <sys/cmt.h>
  53   57  #include <sys/bitset.h>
  54   58  #include <sys/schedctl.h>
  55   59  #include <sys/atomic.h>
  56   60  #include <sys/dtrace.h>
  57   61  #include <sys/sdt.h>
  58   62  #include <sys/archsystm.h>
       63 +#include <sys/ht.h>
  59   64  
  60   65  #include <vm/as.h>
  61   66  
  62   67  #define BOUND_CPU       0x1
  63   68  #define BOUND_PARTITION 0x2
  64   69  #define BOUND_INTR      0x4
  65   70  
  66   71  /* Dispatch queue allocation structure and functions */
  67   72  struct disp_queue_info {
  68   73          disp_t  *dp;            /* dispatcher whose arrays are replaced */

  69   74          dispq_t *olddispq;      /* previous queue array; freed afterwards */
  70   75          dispq_t *newdispq;      /* zeroed replacement queue array */
  71   76          ulong_t *olddqactmap;   /* previous active-queue bitmap */
  72   77          ulong_t *newdqactmap;   /* zeroed replacement bitmap */
  73   78          int     oldnglobpris;   /* priority count the old arrays held */
  74   79  };
  75   80  static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76   81      disp_t *dp);
  77   82  static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78   83  static void     disp_dq_free(struct disp_queue_info *dptr);
  79   84  
  80   85  /* platform-specific routine to call when processor is idle */
  81   86  static void     generic_idle_cpu();
  82   87  void            (*idle_cpu)() = generic_idle_cpu;
  83   88  
  84   89  /* routines invoked when a CPU enters/exits the idle loop */
  85   90  static void     idle_enter();
  86   91  static void     idle_exit();
  87   92  
  88   93  /* platform-specific routine to call when thread is enqueued */
  89   94  static void     generic_enq_thread(cpu_t *, int);
  90   95  void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  91   96  
  92   97  pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  93   98  pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  94   99  pri_t   intr_pri;               /* interrupt thread priority base level */
  95  100  
  96  101  #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  97  102  pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  98  103  disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
  99  104  disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 100  105  int     nswapped;               /* total number of swapped threads */
 101  106  void    disp_swapped_enq(kthread_t *tp);
 102  107  static void     disp_swapped_setrun(kthread_t *tp);
 103  108  static void     cpu_resched(cpu_t *cp, pri_t tpri);
 104  109  
 105  110  /*
 106  111   * If this is set, only interrupt threads will cause kernel preemptions.
 107  112   * This is done by changing the value of kpreemptpri.  kpreemptpri
 108  113   * will either be the max sysclass pri + 1 or the min interrupt pri.
 109  114   */
 110  115  int     only_intr_kpreempt;
 111  116  
 112  117  extern void set_idle_cpu(int cpun);
 113  118  extern void unset_idle_cpu(int cpun);
 114  119  static void setkpdq(kthread_t *tp, int borf);
 115  120  #define SETKP_BACK      0
 116  121  #define SETKP_FRONT     1
 117  122  /*
 118  123   * Parameter that determines how recently a thread must have run
 119  124   * on the CPU to be considered loosely-bound to that CPU to reduce
 120  125   * cold cache effects.  The interval is in hertz.
 121  126   */
 122  127  #define RECHOOSE_INTERVAL 3
 123  128  int     rechoose_interval = RECHOOSE_INTERVAL;
 124  129  
 125  130  /*
 126  131   * Parameter that determines how long (in nanoseconds) a thread must
 127  132   * be sitting on a run queue before it can be stolen by another CPU
 128  133   * to reduce migrations.  The interval is in nanoseconds.
 129  134   *
 130  135   * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
 131  136   * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 132  137   * here indicating it is uninitialized.
 133  138   * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 134  139   *
 135  140   */
 136  141  #define NOSTEAL_UNINITIALIZED   (-1)
 137  142  hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 138  143  extern void cmp_set_nosteal_interval(void);
 139  144  
 140  145  id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 141  146  
 142  147  disp_lock_t     transition_lock;        /* lock on transitioning threads */
 143  148  disp_lock_t     stop_lock;              /* lock on stopped threads */
 144  149  
 145  150  static void     cpu_dispqalloc(int numpris);
 146  151  
 147  152  /*
 148  153   * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 149  154   * a thread because it was sitting on its run queue for a very short
 150  155   * period of time.
 151  156   */
 152  157  #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 153  158  
 154  159  static kthread_t        *disp_getwork(cpu_t *to);
 155  160  static kthread_t        *disp_getbest(disp_t *from);
 156  161  static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 157  162  
 158  163  void    swtch_to(kthread_t *);
 159  164  
 160  165  /*
 161  166   * dispatcher and scheduler initialization
 162  167   */
 163  168  
 164  169  /*
 165  170   * disp_setup - Common code to calculate and allocate dispatcher
 166  171   *              variables and structures based on the maximum priority.
            *
            * maxglobpri   - highest global priority that must be supported.
            * oldnglobpris - number of global priorities the queues are currently
            *                sized for (dispinit() passes 0, disp_add() passes
            *                v.v_nglobpris).
            *
            * Nothing is changed unless the new priority count is larger than the
            * old one.  The caller must hold cpu_lock.
 167  172   */
 168  173  static void
 169  174  disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 170  175  {
 171  176          pri_t   newnglobpris;
 172  177
 173  178          ASSERT(MUTEX_HELD(&cpu_lock));
 174  179
                   /*
                    * One slot per priority 0..maxglobpri plus LOCK_LEVEL more
                    * above it.  NOTE(review): the extra slots are presumably for
                    * interrupt threads, since intr_pri is set to maxglobpri
                    * below -- confirm.
                    */
 175  180          newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 176  181
 177  182          if (newnglobpris > oldnglobpris) {
 178  183                  /*
 179  184                   * Allocate new kp queues for each CPU partition.
 180  185                   */
 181  186                  cpupart_kpqalloc(newnglobpris);
 182  187
 183  188                  /*
 184  189                   * Allocate new dispatch queues for each CPU.
 185  190                   */
 186  191                  cpu_dispqalloc(newnglobpris);
 187  192
 188  193                  /*
 189  194                   * compute new interrupt thread base priority
 190  195                   */
 191  196                  intr_pri = maxglobpri;
 192  197                  if (only_intr_kpreempt) {
                                   /*
                                    * Only interrupt threads may preempt the kernel;
                                    * kpqpri follows unless it was explicitly tuned
                                    * away from its KPQPRI default.
                                    */
 193  198                          kpreemptpri = intr_pri + 1;
 194  199                          if (kpqpri == KPQPRI)
 195  200                                  kpqpri = kpreemptpri;
 196  201                  }
                           /* Record the priority count the queues are now sized for. */
 197  202                  v.v_nglobpris = newnglobpris;
 198  203          }
 199  204  }
 200  205  
 201  206  /*
 202  207   * dispinit - Called to initialize all loaded classes and the
 203  208   *            dispatcher framework.
            *
            * Takes and drops cpu_lock internally, then loads the default
            * scheduling class; panics if that class cannot be loaded.
 204  209   */
 205  210  void
 206  211  dispinit(void)
 207  212  {
 208  213          id_t    cid;
 209  214          pri_t   maxglobpri;
 210  215          pri_t   cl_maxglobpri;
 211  216
                   /* -1 means no class has reported a maximum priority yet. */
 212  217          maxglobpri = -1;
 213  218
 214  219          /*
 215  220           * Initialize transition lock, which will always be set.
                    * It is entered here and not released by this function.
 216  221           */
 217  222          DISP_LOCK_INIT(&transition_lock);
 218  223          disp_lock_enter_high(&transition_lock);
 219  224          DISP_LOCK_INIT(&stop_lock);
 220  225
 221  226          mutex_enter(&cpu_lock);
                   /* -1 marks the current CPU's queue as having nothing runnable. */
 222  227          CPU->cpu_disp->disp_maxrunpri = -1;
 223  228          CPU->cpu_disp->disp_max_unbound_pri = -1;
 224  229
 225  230          /*
 226  231           * Initialize the default CPU partition.
 227  232           */
 228  233          cpupart_initialize_default();
 229  234          /*
 230  235           * Call the class specific initialization functions for
 231  236           * all pre-installed schedulers.
 232  237           *
 233  238           * We pass the size of a class specific parameter
 234  239           * buffer to each of the initialization functions
 235  240           * to try to catch problems with backward compatibility
 236  241           * of class modules.
 237  242           *
 238  243           * For example a new class module running on an old system
 239  244           * which didn't provide sufficiently large parameter buffers
 240  245           * would be bad news. Class initialization modules can check for
 241  246           * this and take action if they detect a problem.
 242  247           */
 243  248
 244  249          for (cid = 0; cid < nclass; cid++) {
 245  250                  sclass_t        *sc;
 246  251
 247  252                  sc = &sclass[cid];
 248  253                  if (SCHED_INSTALLED(sc)) {
                                   /* Track the largest priority any class returns. */
 249  254                          cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 250  255                              &sc->cl_funcs);
 251  256                          if (cl_maxglobpri > maxglobpri)
 252  257                                  maxglobpri = cl_maxglobpri;
 253  258                  }
 254  259          }
                   /*
                    * Kernel preemption applies just above the maximum system
                    * priority; kpqpri follows it unless it was changed from its
                    * KPQPRI default.
                    */
 255  260          kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 256  261          if (kpqpri == KPQPRI)
 257  262                  kpqpri = kpreemptpri;
 258  263
                   /* At least one installed class must have reported a priority. */
 259  264          ASSERT(maxglobpri >= 0);
                   /* Old priority count is 0: this is the first sizing of the queues. */
 260  265          disp_setup(maxglobpri, 0);
 261  266
 262  267          mutex_exit(&cpu_lock);
 263  268
 264  269          /*
 265  270           * Platform specific sticky scheduler setup.
 266  271           */
 267  272          if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 268  273                  cmp_set_nosteal_interval();
 269  274
 270  275          /*
 271  276           * Get the default class ID; this may be later modified via
 272  277           * dispadmin(1M).  This will load the class (normally TS) and that will
 273  278           * call disp_add(), which is why we had to drop cpu_lock first.
 274  279           */
 275  280          if (getcid(defaultclass, &defaultcid) != 0) {
 276  281                  cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 277  282                      defaultclass);
 278  283          }
 279  284  }
 280  285  
 281  286  /*
 282  287   * disp_add - Called with class pointer to initialize the dispatcher
 283  288   *            for a newly loaded class.
            *
            * clp points at the class's entry in sclass[]; its index in that array
            * is passed to cl_init() as the class ID.  Acquires cpu_lock itself, so
            * the caller must not already hold it.
 284  289   */
 285  290  void
 286  291  disp_add(sclass_t *clp)
 287  292  {
 288  293          pri_t   maxglobpri;
 289  294          pri_t   cl_maxglobpri;
 290  295
 291  296          mutex_enter(&cpu_lock);
 292  297          /*
 293  298           * Initialize the scheduler class.
                    *
                    * The current maximum global priority is recovered from
                    * v.v_nglobpris by inverting the computation in disp_setup(),
                    * then raised if the new class needs a higher one.
 294  299           */
 295  300          maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 296  301          cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 297  302          if (cl_maxglobpri > maxglobpri)
 298  303                  maxglobpri = cl_maxglobpri;
 299  304
 300  305          /*
 301  306           * Save old queue information.  Since we're initializing a
 302  307           * new scheduling class which has just been loaded, then
 303  308           * the size of the dispq may have changed.  We need to handle
 304  309           * that here.
                    * disp_setup() does nothing if the priority count did not grow.
 305  310           */
 306  311          disp_setup(maxglobpri, v.v_nglobpris);
 307  312
 308  313          mutex_exit(&cpu_lock);
 309  314  }
 310  315  
 311  316  
 312  317  /*
 313  318   * For each CPU, allocate new dispatch queues
 314  319   * with the stated number of priorities.
            *
            * The work is done in three phases: allocate every replacement array,
            * swap them in while all other CPUs are paused, then free the old
            * arrays once the CPUs are running again.  The caller must hold
            * cpu_lock.
 315  320   */
 316  321  static void
 317  322  cpu_dispqalloc(int numpris)
 318  323  {
 319  324          cpu_t   *cpup;
 320  325          struct disp_queue_info  *disp_mem;
 321  326          int i, num;
 322  327
 323  328          ASSERT(MUTEX_HELD(&cpu_lock));
 324  329
                   /* Bookkeeping array with room for NCPU entries, one per CPU. */
 325  330          disp_mem = kmem_zalloc(NCPU *
 326  331              sizeof (struct disp_queue_info), KM_SLEEP);
 327  332
 328  333          /*
 329  334           * This routine must allocate all of the memory before stopping
 330  335           * the cpus because it must not sleep in kmem_alloc while the
 331  336           * CPUs are stopped.  Locks they hold will not be freed until they
 332  337           * are restarted.
 333  338           */
 334  339          i = 0;
 335  340          cpup = cpu_list;
                   /* cpu_list is circular: walk until we return to the head. */
 336  341          do {
 337  342                  disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 338  343                  i++;
 339  344                  cpup = cpup->cpu_next;
 340  345          } while (cpup != cpu_list);
                   /* Number of disp_mem entries actually filled in. */
 341  346          num = i;
 342  347
 343  348          pause_cpus(NULL, NULL);
 344  349          for (i = 0; i < num; i++)
 345  350                  disp_dq_assign(&disp_mem[i], numpris);
 346  351          start_cpus();
 347  352
 348  353          /*
 349  354           * I must free all of the memory after starting the cpus because
 350  355           * I can not risk sleeping in kmem_free while the cpus are stopped.
 351  356           */
 352  357          for (i = 0; i < num; i++)
 353  358                  disp_dq_free(&disp_mem[i]);
 354  359
 355  360          kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 356  361  }
 357  362  
 358  363  static void
 359  364  disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 360  365  {
                   /*
                    * Allocate (KM_SLEEP, zero-filled) a queue array with numpris
                    * entries and a bitmap of (numpris / BT_NBIPUL) + 1 longs, and
                    * record in dptr which dispatcher they are intended for.
                    * Nothing is installed into dp here; only the new* and dp
                    * fields of dptr are written.
                    */
 361  366          dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 362  367          dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 363  368              sizeof (long), KM_SLEEP);
 364  369          dptr->dp = dp;
 365  370  }
 366  371  
 367  372  static void
 368  373  disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 369  374  {
 370  375          disp_t  *dp;
 371  376
                   /*
                    * Swap the arrays allocated by disp_dq_alloc() into dptr->dp.
                    * The dispatcher's current arrays and priority count are saved
                    * in the old* fields of dptr so that disp_dq_free() can release
                    * them later.
                    */
 372  377          dp = dptr->dp;
 373  378          dptr->olddispq = dp->disp_q;
 374  379          dptr->olddqactmap = dp->disp_qactmap;
 375  380          dptr->oldnglobpris = dp->disp_npri;
 376  381
                   /* The queues are only ever grown. */
 377  382          ASSERT(dptr->oldnglobpris < numpris);
 378  383
 379  384          if (dptr->olddispq != NULL) {
 380  385                  /*
 381  386                   * Use kcopy because bcopy is platform-specific
 382  387                   * and could block while we might have paused the cpus.
                            * The kcopy return values are deliberately ignored.
 383  388                   */
 384  389                  (void) kcopy(dptr->olddispq, dptr->newdispq,
 385  390                      dptr->oldnglobpris * sizeof (dispq_t));
 386  391                  (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 387  392                      ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 388  393                      sizeof (long));
 389  394          }
 390  395          dp->disp_q = dptr->newdispq;
 391  396          dp->disp_qactmap = dptr->newdqactmap;
                   /* disp_q_limit points one element past the end of the array. */
 392  397          dp->disp_q_limit = &dptr->newdispq[numpris];
 393  398          dp->disp_npri = numpris;
 394  399  }
 395  400  
 396  401  static void
 397  402  disp_dq_free(struct disp_queue_info *dptr)
 398  403  {
                   /*
                    * Release the old queue array and old bitmap recorded in dptr,
                    * using oldnglobpris to reproduce the sizes they were allocated
                    * with.  Only the old* fields of dptr are read; either pointer
                    * may be NULL, in which case it is skipped.
                    */
 399  404          if (dptr->olddispq != NULL)
 400  405                  kmem_free(dptr->olddispq,
 401  406                      dptr->oldnglobpris * sizeof (dispq_t));
 402  407          if (dptr->olddqactmap != NULL)
 403  408                  kmem_free(dptr->olddqactmap,
 404  409                      ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 405  410  }
 406  411  
 407  412  /*
 408  413   * For a newly created CPU, initialize the dispatch queue.
 409  414   * This is called before the CPU is known through cpu[] or on any lists.
            *
            * On return cp->cpu_disp points at a zeroed disp_t with empty queue
            * arrays sized for v.v_nglobpris priorities.  The caller must hold
            * cpu_lock.
 410  415   */
 411  416  void
 412  417  disp_cpu_init(cpu_t *cp)
 413  418  {
 414  419          disp_t  *dp;
 415  420          dispq_t *newdispq;
 416  421          ulong_t *newdqactmap;
 417  422
 418  423          ASSERT(MUTEX_HELD(&cpu_lock));  /* protect dispatcher queue sizes */
 419  424
                   /*
                    * The CPU recorded in cpu0_disp uses that statically allocated
                    * disp_t; every other CPU gets a heap-allocated one.
                    * NOTE(review): cpu0_disp.disp_cpu is presumably set up by
                    * boot code outside this file section -- confirm.
                    */
 420  425          if (cp == cpu0_disp.disp_cpu)
 421  426                  dp = &cpu0_disp;
 422  427          else
 423  428                  dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
                   /* kmem_alloc does not zero, so clear the structure in both cases. */
 424  429          bzero(dp, sizeof (disp_t));
 425  430          cp->cpu_disp = dp;
 426  431          dp->disp_cpu = cp;
                   /* -1 means nothing is runnable on this queue yet. */
 427  432          dp->disp_maxrunpri = -1;
 428  433          dp->disp_max_unbound_pri = -1;
 429  434          DISP_LOCK_INIT(&cp->cpu_thread_lock);
 430  435          /*
 431  436           * Allocate memory for the dispatcher queue headers
 432  437           * and the active queue bitmap.
 433  438           */
 434  439          newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 435  440          newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 436  441              sizeof (long), KM_SLEEP);
 437  442          dp->disp_q = newdispq;
 438  443          dp->disp_qactmap = newdqactmap;
 439  444          dp->disp_q_limit = &newdispq[v.v_nglobpris];
 440  445          dp->disp_npri = v.v_nglobpris;
 441  446  }
 442  447  
 443  448  void
 444  449  disp_cpu_fini(cpu_t *cp)
 445  450  {
                   /*
                    * Tear down what disp_cpu_init() built for cp: free the queue
                    * array and bitmap, then the disp_t itself unless it is the
                    * statically allocated cpu0_disp.  cp->cpu_disp is not cleared
                    * here.  The caller must hold cpu_lock.
                    */
 446  451          ASSERT(MUTEX_HELD(&cpu_lock));
 447  452
 448  453          disp_kp_free(cp->cpu_disp);
 449  454          if (cp->cpu_disp != &cpu0_disp)
 450  455                  kmem_free(cp->cpu_disp, sizeof (disp_t));
 451  456  }
 452  457  
 453  458  /*
 454  459   * Allocate new, larger kpreempt dispatch queue to replace the old one.
            *
            * Does nothing when npri is not greater than dq's current priority
            * count.  Unlike cpu_dispqalloc(), this routine does not pause the
            * CPUs itself.
 455  460   */
 456  461  void
 457  462  disp_kp_alloc(disp_t *dq, pri_t npri)
 458  463  {
                   /*
                    * mem_info is not initialized up front: disp_dq_alloc() fills
                    * in the new* and dp fields and disp_dq_assign() fills in the
                    * old* fields before disp_dq_free() reads them.
                    */
 459  464          struct disp_queue_info  mem_info;
 460  465
 461  466          if (npri > dq->disp_npri) {
 462  467                  /*
 463  468                   * Allocate memory for the new array.
 464  469                   */
 465  470                  disp_dq_alloc(&mem_info, npri, dq);
 466  471
 467  472                  /*
 468  473                   * We need to copy the old structures to the new
 469  474                   * and free the old.
 470  475                   */
 471  476                  disp_dq_assign(&mem_info, npri);
 472  477                  disp_dq_free(&mem_info);
 473  478          }
 474  479  }
 475  480  
 476  481  /*
 477  482   * Free dispatch queue.
 478  483   * Used for the kpreempt queues for a removed CPU partition and
 479  484   * for the per-CPU queues of deleted CPUs.
            *
            * Only the queue array and bitmap are freed, not dq itself, and
            * dq->disp_q / dq->disp_qactmap are left pointing at the freed memory.
 480  485   */
 481  486  void
 482  487  disp_kp_free(disp_t *dq)
 483  488  {
 484  489          struct disp_queue_info  mem_info;
 485  490
                   /*
                    * Present dq's current arrays as the "old" ones; these are the
                    * only fields disp_dq_free() looks at.
                    */
 486  491          mem_info.olddispq = dq->disp_q;
 487  492          mem_info.olddqactmap = dq->disp_qactmap;
 488  493          mem_info.oldnglobpris = dq->disp_npri;
 489  494          disp_dq_free(&mem_info);
 490  495  }
 491  496  
 492  497  /*
 493  498   * End dispatcher and scheduler initialization.
 494  499   */
 495  500  
 496  501  /*
 497  502   * See if there's anything to do other than remain idle.
 498  503   * Return non-zero if there is.
 499  504   *
 500  505   * This function must be called with high spl, or with
 501  506   * kernel preemption disabled to prevent the partition's
 502  507   * active cpu list from changing while being traversed.
 503  508   *
 504  509   * This is essentially a simpler version of disp_getwork()
 505  510   * to be called by CPUs preparing to "halt".
            *
            * An offline CPU always gets 0 back.
 506  511   */
 507  512  int
 508  513  disp_anywork(void)
 509  514  {
 510  515          cpu_t           *cp = CPU;
 511  516          cpu_t           *ocp;
                   /* Accessed through a volatile pointer so each check re-reads it. */
 512  517          volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 513  518
 514  519          if (!(cp->cpu_flags & CPU_OFFLINE)) {
                           /* Work is waiting on the partition-wide kpreempt queue. */
 515  520                  if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 516  521                          return (1);
 517  522
                           /* Visit every other CPU on this partition's circular list. */
 518  523                  for (ocp = cp->cpu_next_part; ocp != cp;
 519  524                      ocp = ocp->cpu_next_part) {
 520  525                          ASSERT(CPU_ACTIVE(ocp));
 521  526
 522  527                          /*
 523  528                           * Something has appeared on the local run queue.
 524  529                           */
 525  530                          if (*local_nrunnable > 0)
 526  531                                  return (1);
 527  532                          /*
 528  533                           * If we encounter another idle CPU that will
 529  534                           * soon be trolling around through disp_anywork()
 530  535                           * terminate our walk here and let this other CPU
 531  536                           * patrol the next part of the list.
                                    * (A cpu_dispatch_pri of -1 is what disp() sets
                                    * when it dispatches the idle thread.)
 532  537                           */
 533  538                          if (ocp->cpu_dispatch_pri == -1 &&
 534  539                              (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 535  540                                  return (0);
 536  541                          /*
 537  542                           * Work can be taken from another CPU if:
 538  543                           *      - There is unbound work on the run queue
 539  544                           *      - That work isn't a thread undergoing a
 540  545                           *        context switch on an otherwise empty queue.
 541  546                           *      - The CPU isn't running the idle loop.
 542  547                           */
 543  548                          if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 544  549                              !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 545  550                              ocp->cpu_disp->disp_nrunnable == 1) &&
 546  551                              ocp->cpu_dispatch_pri != -1)
 547  552                                  return (1);
 548  553                  }
 549  554          }
 550  555          return (0);
 551  556  }
 552  557  
 553  558  /*
 554  559   * Called when CPU enters the idle loop
            *
            * Switches the CPU's microstate to CMS_IDLE, bumps the idlethread
            * statistic and notifies the platform through set_idle_cpu().
 555  560   */
 556  561  static void
 557  562  idle_enter()
 558  563  {
 559  564          cpu_t           *cp = CPU;
 560  565
 561  566          new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 562  567          CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 563  568          set_idle_cpu(cp->cpu_id);       /* arch-dependent hook */
 564  569  }
 565  570  
 566  571  /*
 567  572   * Called when CPU exits the idle loop
            *
            * Counterpart of idle_enter(): switches the CPU's microstate to
            * CMS_SYSTEM and notifies the platform through unset_idle_cpu().
 568  573   */
 569  574  static void
 570  575  idle_exit()
 571  576  {
 572  577          cpu_t           *cp = CPU;
 573  578
 574  579          new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 575  580          unset_idle_cpu(cp->cpu_id);     /* arch-dependent hook */
 576  581  }
 577  582  
 578  583  /*
 579  584   * Idle loop.
            *
            * Does not return.  Each pass either waits in (*idle_cpu)() or leaves
            * the idle state with idle_exit() and switches to a runnable thread;
            * idle_enter() is called again once this thread is switched back to.
 580  585   */
 581  586  void
 582  587  idle()
 583  588  {
 584  589          struct cpu      *cp = CPU;              /* pointer to this CPU */
 585  590          kthread_t       *t;                     /* taken thread */
 586  591
 587  592          idle_enter();
 588  593
 589  594          /*
 590  595           * Uniprocessor version of idle loop.
 591  596           * Do this until notified that we're on an actual multiprocessor.
 592  597           */
 593  598          while (ncpus == 1) {
 594  599                  if (cp->cpu_disp->disp_nrunnable == 0) {
                                   /* Nothing local to run and no other CPU to steal from. */
 595  600                          (*idle_cpu)();
 596  601                          continue;
 597  602                  }
 598  603                  idle_exit();
 599  604                  swtch();
 600  605
 601  606                  idle_enter(); /* returned from swtch */
 602  607          }
 603  608
 604  609          /*
 605  610           * Multiprocessor idle loop.
 606  611           */
 607  612          for (;;) {
 608  613                  /*
 609  614                   * If CPU is completely quiesced by p_online(2), just wait
 610  615                   * here with minimal bus traffic until put online.
 611  616                   */
 612  617                  while (cp->cpu_flags & CPU_QUIESCED)
 613  618                          (*idle_cpu)();
 614  619
 615  620                  if (cp->cpu_disp->disp_nrunnable != 0) {
                                   /* Local work is available: run it. */
 616  621                          idle_exit();
 617  622                          swtch();
 618  623                  } else {
                                   /*
                                    * An offline CPU does not look for work on other
                                    * queues.  Each "continue" in this branch is taken
                                    * while still in the idle state, so it correctly
                                    * bypasses the idle_enter() at the bottom of the
                                    * loop.
                                    */
 619  624                          if (cp->cpu_flags & CPU_OFFLINE)
 620  625                                  continue;
 621  626                          if ((t = disp_getwork(cp)) == NULL) {
 622  627                                  if (cp->cpu_chosen_level != -1) {
 623  628                                          disp_t *dp = cp->cpu_disp;
 624  629                                          disp_t *kpq;
 625  630
 626  631                                          disp_lock_enter(&dp->disp_lock);
 627  632                                          /*
 628  633                                           * Set kpq under lock to prevent
 629  634                                           * migration between partitions.
 630  635                                           */
 631  636                                          kpq = &cp->cpu_part->cp_kp_queue;
                                                   /*
                                                    * Reset the chosen level once the
                                                    * partition queue is empty.
                                                    */
 632  637                                          if (kpq->disp_maxrunpri == -1)
 633  638                                                  cp->cpu_chosen_level = -1;
 634  639                                          disp_lock_exit(&dp->disp_lock);
 635  640                                  }
 636  641                                  (*idle_cpu)();
 637  642                                  continue;
 638  643                          }
 639  644                          /*
 640  645                           * If there was a thread but we couldn't steal
 641  646                           * it, then keep trying.
 642  647                           */
 643  648                          if (t == T_DONTSTEAL)
 644  649                                  continue;
                                   /* Got a thread from another queue: switch to it. */
 645  650                          idle_exit();
 646  651                          swtch_to(t);
 647  652                  }
 648  653                  idle_enter(); /* returned from swtch/swtch_to */
 649  654          }
 650  655  }
 651  656  
 652  657  
 653  658  /*
 654  659   * Preempt the currently running thread in favor of the highest
 655  660   * priority thread.  The class of the current thread controls
 656  661   * where it goes on the dispatcher queues. If panicking, turn
 657  662   * preemption off.
            *
            * Operates on curthread and returns nothing; when panicstr is set the
            * function returns immediately without touching the thread.
 658  663   */
 659  664  void
 660  665  preempt()
 661  666  {
 662  667          kthread_t       *t = curthread;
                   /* May be NULL; checked before use below. */
 663  668          klwp_t          *lwp = ttolwp(curthread);
 664  669
 665  670          if (panicstr)
 666  671                  return;
 667  672
 668  673          TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 669  674
 670  675          thread_lock(t);
 671  676
 672  677          if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 673  678                  /*
 674  679                   * this thread has already been chosen to be run on
 675  680                   * another CPU. Clear kprunrun on this CPU since we're
 676  681                   * already headed for swtch().
 677  682                   */
 678  683                  CPU->cpu_kprunrun = 0;
 679  684                  thread_unlock_nopreempt(t);
 680  685                  TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 681  686          } else {
                           /*
                            * Count the preemption in the lwp's nivcsw counter (if
                            * the thread has an lwp) and in the CPU's inv_swtch
                            * statistic, then let the thread's scheduling class
                            * requeue it via CL_PREEMPT() while it is in transition.
                            */
 682  687                  if (lwp != NULL)
 683  688                          lwp->lwp_ru.nivcsw++;
 684  689                  CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 685  690                  THREAD_TRANSITION(t);
 686  691                  CL_PREEMPT(t);
 687  692                  DTRACE_SCHED(preempt);
 688  693                  thread_unlock_nopreempt(t);
 689  694
 690  695                  TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 691  696
 692  697                  swtch();                /* clears CPU->cpu_runrun via disp() */
 693  698          }
 694  699  }
 695  700  
 696  701  extern kthread_t *thread_unpin();
 697  702  
 698  703  /*
 699  704   * disp() - find the highest priority thread for this processor to run, and
 700  705   * set it in TS_ONPROC state so that resume() can be called to run it.
 701  706   */
 702  707  static kthread_t *
 703  708  disp()
 704  709  {
 705  710          cpu_t           *cpup;
 706  711          disp_t          *dp;
 707  712          kthread_t       *tp;
 708  713          dispq_t         *dq;
 709  714          int             maxrunword;
 710  715          pri_t           pri;
 711  716          disp_t          *kpq;
 712  717  
 713  718          TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 714  719  
 715  720          cpup = CPU;
 716  721          /*
 717  722           * Find the highest priority loaded, runnable thread.
 718  723           */
 719  724          dp = cpup->cpu_disp;
 720  725  
 721  726  reschedule:
 722  727          /*
 723  728           * If there is more important work on the global queue with a better
 724  729           * priority than the maximum on this CPU, take it now.
 725  730           */
 726  731          kpq = &cpup->cpu_part->cp_kp_queue;
 727  732          while ((pri = kpq->disp_maxrunpri) >= 0 &&
 728  733              pri >= dp->disp_maxrunpri &&
 729  734              (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 730  735              (tp = disp_getbest(kpq)) != NULL) {
 731  736                  if (disp_ratify(tp, kpq) != NULL) {
 732  737                          TRACE_1(TR_FAC_DISP, TR_DISP_END,
 733  738                              "disp_end:tid %p", tp);
 734  739                          return (tp);
 735  740                  }
 736  741          }
 737  742  
 738  743          disp_lock_enter(&dp->disp_lock);
 739  744          pri = dp->disp_maxrunpri;
 740  745  
 741  746          /*
 742  747           * If there is nothing to run, look at what's runnable on other queues.
 743  748           * Choose the idle thread if the CPU is quiesced.
 744  749           * Note that CPUs that have the CPU_OFFLINE flag set can still run
 745  750           * interrupt threads, which will be the only threads on the CPU's own
 746  751           * queue, but cannot run threads from other queues.
 747  752           */
 748  753          if (pri == -1) {
 749  754                  if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 750  755                          disp_lock_exit(&dp->disp_lock);
 751  756                          if ((tp = disp_getwork(cpup)) == NULL ||
 752  757                              tp == T_DONTSTEAL) {
 753  758                                  tp = cpup->cpu_idle_thread;
 754  759                                  (void) splhigh();
 755  760                                  THREAD_ONPROC(tp, cpup);
 756  761                                  cpup->cpu_dispthread = tp;
 757  762                                  cpup->cpu_dispatch_pri = -1;
 758  763                                  cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 759  764                                  cpup->cpu_chosen_level = -1;
 760  765                          }
 761  766                  } else {
 762  767                          disp_lock_exit_high(&dp->disp_lock);
 763  768                          tp = cpup->cpu_idle_thread;
 764  769                          THREAD_ONPROC(tp, cpup);
 765  770                          cpup->cpu_dispthread = tp;
 766  771                          cpup->cpu_dispatch_pri = -1;
 767  772                          cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 768  773                          cpup->cpu_chosen_level = -1;
 769  774                  }
 770  775                  TRACE_1(TR_FAC_DISP, TR_DISP_END,
 771  776                      "disp_end:tid %p", tp);
 772  777                  return (tp);
 773  778          }
 774  779  
 775  780          dq = &dp->disp_q[pri];
 776  781          tp = dq->dq_first;
 777  782  
 778  783          ASSERT(tp != NULL);
 779  784          ASSERT(tp->t_schedflag & TS_LOAD);      /* thread must be swapped in */
 780  785  
 781  786          DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 782  787  
 783  788          /*
 784  789           * Found it so remove it from queue.
 785  790           */
 786  791          dp->disp_nrunnable--;
 787  792          dq->dq_sruncnt--;
 788  793          if ((dq->dq_first = tp->t_link) == NULL) {
 789  794                  ulong_t *dqactmap = dp->disp_qactmap;
 790  795  
 791  796                  ASSERT(dq->dq_sruncnt == 0);
 792  797                  dq->dq_last = NULL;
 793  798  
 794  799                  /*
 795  800                   * The queue is empty, so the corresponding bit needs to be
 796  801                   * turned off in dqactmap.  If nrunnable != 0, we just took
 797  802                   * the last runnable thread off the highest queue, so
 798  803                   * recompute disp_maxrunpri.
 799  804                   */
 800  805                  maxrunword = pri >> BT_ULSHIFT;
 801  806                  dqactmap[maxrunword] &= ~BT_BIW(pri);
 802  807  
 803  808                  if (dp->disp_nrunnable == 0) {
 804  809                          dp->disp_max_unbound_pri = -1;
 805  810                          dp->disp_maxrunpri = -1;
 806  811                  } else {
 807  812                          int ipri;
 808  813  
 809  814                          ipri = bt_gethighbit(dqactmap, maxrunword);
 810  815                          dp->disp_maxrunpri = ipri;
 811  816                          if (ipri < dp->disp_max_unbound_pri)
 812  817                                  dp->disp_max_unbound_pri = ipri;
 813  818                  }
 814  819          } else {
 815  820                  tp->t_link = NULL;
 816  821          }
 817  822  
 818  823          /*
 819  824           * Set TS_DONT_SWAP flag to prevent another processor from swapping
 820  825           * out this thread before we have a chance to run it.
 821  826           * While running, it is protected against swapping by t_lock.
 822  827           */
 823  828          tp->t_schedflag |= TS_DONT_SWAP;
 824  829          cpup->cpu_dispthread = tp;              /* protected by spl only */
 825  830          cpup->cpu_dispatch_pri = pri;
 826  831          ASSERT(pri == DISP_PRIO(tp));
 827  832          thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 828  833          disp_lock_exit_high(&dp->disp_lock);    /* drop run queue lock */
 829  834  
 830  835          ASSERT(tp != NULL);
 831  836          TRACE_1(TR_FAC_DISP, TR_DISP_END,
 832  837              "disp_end:tid %p", tp);
 833  838  
 834  839          if (disp_ratify(tp, kpq) == NULL)
 835  840                  goto reschedule;
 836  841  
 837  842          return (tp);
 838  843  }
 839  844  
 840  845  /*
 841  846   * swtch()
 842  847   *      Find best runnable thread and run it.
 843  848   *      Called with the current thread already switched to a new state,
 844  849   *      on a sleep queue, run queue, stopped, and not zombied.
 845  850   *      May be called at any spl level less than or equal to LOCK_LEVEL.
 846  851   *      Always drops spl to the base level (spl0()).
 847  852   */
 848  853  void
 849  854  swtch()
 850  855  {
 851  856          kthread_t       *t = curthread;
 852  857          kthread_t       *next;
 853  858          cpu_t           *cp;
 854  859  
 855  860          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 856  861  
 857  862          if (t->t_flag & T_INTR_THREAD)
 858  863                  cpu_intr_swtch_enter(t);
 859  864  
 860  865          if (t->t_intr != NULL) {
 861  866                  /*
 862  867                   * We are an interrupt thread.  Setup and return
 863  868                   * the interrupted thread to be resumed.
 864  869                   */
 865  870                  (void) splhigh();       /* block other scheduler action */
 866  871                  cp = CPU;               /* now protected against migration */
 867  872                  ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 868  873                  CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 869  874                  CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 870  875                  next = thread_unpin();
 871  876                  TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 872  877                  resume_from_intr(next);
 873  878          } else {
 874  879  #ifdef  DEBUG
 875  880                  if (t->t_state == TS_ONPROC &&
 876  881                      t->t_disp_queue->disp_cpu == CPU &&
 877  882                      t->t_preempt == 0) {
 878  883                          thread_lock(t);
 879  884                          ASSERT(t->t_state != TS_ONPROC ||
 880  885                              t->t_disp_queue->disp_cpu != CPU ||
 881  886                              t->t_preempt != 0); /* cannot migrate */
 882  887                          thread_unlock_nopreempt(t);
 883  888                  }
 884  889  #endif  /* DEBUG */
 885  890                  cp = CPU;
 886  891                  next = disp();          /* returns with spl high */
 887  892                  ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 888  893  
 889  894                  /* OK to steal anything left on run queue */
 890  895                  cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 891  896  
 892  897                  if (next != t) {
 893  898                          hrtime_t now;
 894  899  
 895  900                          now = gethrtime_unscaled();
 896  901                          pg_ev_thread_swtch(cp, now, t, next);
 897  902  
 898  903                          /*
 899  904                           * If t was previously in the TS_ONPROC state,
 900  905                           * setfrontdq and setbackdq won't have set its t_waitrq.
 901  906                           * Since we now finally know that we're switching away
 902  907                           * from this thread, set its t_waitrq if it is on a run
 903  908                           * queue.
 904  909                           */
 905  910                          if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 906  911                                  t->t_waitrq = now;
 907  912                          }
 908  913  
 909  914                          /*
 910  915                           * restore mstate of thread that we are switching to
 911  916                           */
 912  917                          restore_mstate(next);
 913  918  
 914  919                          CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 915  920                          cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 916  921                          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 917  922  
 918  923                          if (dtrace_vtime_active)
 919  924                                  dtrace_vtime_switch(next);
 920  925  
 921  926                          resume(next);
 922  927                          /*
 923  928                           * The TR_RESUME_END and TR_SWTCH_END trace points
 924  929                           * appear at the end of resume(), because we may not
 925  930                           * return here
 926  931                           */
 927  932                  } else {
 928  933                          if (t->t_flag & T_INTR_THREAD)
 929  934                                  cpu_intr_swtch_exit(t);
 930  935                          /*
 931  936                           * Threads that enqueue themselves on a run queue defer
 932  937                           * setting t_waitrq. It is then either set in swtch()
 933  938                           * when the CPU is actually yielded, or not at all if it
 934  939                           * is remaining on the CPU.
 935  940                           * There is however a window between where the thread
 936  941                           * placed itself on a run queue, and where it selects
 937  942                           * itself in disp(), where a third party (eg. clock()
 938  943                           * doing tick processing) may have re-enqueued this
 939  944                           * thread, setting t_waitrq in the process. We detect
 940  945                           * this race by noticing that despite switching to
 941  946                           * ourself, our t_waitrq has been set, and should be
 942  947                           * cleared.
 943  948                           */
 944  949                          if (t->t_waitrq != 0)
 945  950                                  t->t_waitrq = 0;
 946  951  
 947  952                          pg_ev_thread_remain(cp, t);
 948  953  
 949  954                          DTRACE_SCHED(remain__cpu);
 950  955                          TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 951  956                          (void) spl0();
 952  957                  }
 953  958          }
 954  959  }
 955  960  
 956  961  /*
 957  962   * swtch_from_zombie()
 958  963   *      Special case of swtch(), which allows checks for TS_ZOMB to be
 959  964   *      eliminated from normal resume.
 960  965   *      Find best runnable thread and run it.
 961  966   *      Called with the current thread zombied.
 962  967   *      Zombies cannot migrate, so CPU references are safe.
 963  968   */
 964  969  void
 965  970  swtch_from_zombie()
 966  971  {
 967  972          kthread_t       *next;
 968  973          cpu_t           *cpu = CPU;
 969  974  
 970  975          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 971  976  
 972  977          ASSERT(curthread->t_state == TS_ZOMB);
 973  978  
 974  979          next = disp();                  /* returns with spl high */
 975  980          ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 976  981          CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 977  982          ASSERT(next != curthread);
 978  983          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 979  984  
 980  985          pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 981  986  
 982  987          restore_mstate(next);
 983  988  
 984  989          if (dtrace_vtime_active)
 985  990                  dtrace_vtime_switch(next);
 986  991  
 987  992          resume_from_zombie(next);
 988  993          /*
 989  994           * The TR_RESUME_END and TR_SWTCH_END trace points
 990  995           * appear at the end of resume(), because we certainly will not
 991  996           * return here
 992  997           */
 993  998  }
 994  999  
 995 1000  #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 996 1001  
 997 1002  /*
 998 1003   * search_disp_queues()
 999 1004   *      Search the given dispatch queues for thread tp.
1000 1005   *      Return 1 if tp is found, otherwise return 0.
1001 1006   */
1002 1007  static int
1003 1008  search_disp_queues(disp_t *dp, kthread_t *tp)
1004 1009  {
1005 1010          dispq_t         *dq;
1006 1011          dispq_t         *eq;
1007 1012  
1008 1013          disp_lock_enter_high(&dp->disp_lock);
1009 1014  
1010 1015          for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1016                  kthread_t       *rp;
1012 1017  
1013 1018                  ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1019  
1015 1020                  for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1021                          if (tp == rp) {
1017 1022                                  disp_lock_exit_high(&dp->disp_lock);
1018 1023                                  return (1);
1019 1024                          }
1020 1025          }
1021 1026          disp_lock_exit_high(&dp->disp_lock);
1022 1027  
1023 1028          return (0);
1024 1029  }
1025 1030  
1026 1031  /*
1027 1032   * thread_on_queue()
1028 1033   *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1034   *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1035   */
1031 1036  static int
1032 1037  thread_on_queue(kthread_t *tp)
1033 1038  {
1034 1039          cpu_t           *cp;
1035 1040          struct cpupart  *part;
1036 1041  
1037 1042          ASSERT(getpil() >= DISP_LEVEL);
1038 1043  
1039 1044          /*
1040 1045           * Search the per-CPU dispatch queues for tp.
1041 1046           */
1042 1047          cp = CPU;
1043 1048          do {
1044 1049                  if (search_disp_queues(cp->cpu_disp, tp))
1045 1050                          return (1);
1046 1051          } while ((cp = cp->cpu_next_onln) != CPU);
1047 1052  
1048 1053          /*
1049 1054           * Search the partition-wide kpreempt queues for tp.
1050 1055           */
1051 1056          part = CPU->cpu_part;
1052 1057          do {
1053 1058                  if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1059                          return (1);
1055 1060          } while ((part = part->cp_next) != CPU->cpu_part);
1056 1061  
1057 1062          return (0);
1058 1063  }
1059 1064  
1060 1065  #else
1061 1066  
1062 1067  #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1063 1068  
1064 1069  #endif  /* DEBUG */
1065 1070  
1066 1071  /*
1067 1072   * like swtch(), but switch to a specified thread taken from another CPU.
1068 1073   *      called with spl high.
1069 1074   */
1070 1075  void
1071 1076  swtch_to(kthread_t *next)
1072 1077  {
1073 1078          cpu_t                   *cp = CPU;
1074 1079          hrtime_t                now;
1075 1080  
1076 1081          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1082  
1078 1083          /*
1079 1084           * Update context switch statistics.
1080 1085           */
1081 1086          CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1087  
1083 1088          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1089  
1085 1090          now = gethrtime_unscaled();
1086 1091          pg_ev_thread_swtch(cp, now, curthread, next);
1087 1092  
1088 1093          /* OK to steal anything left on run queue */
1089 1094          cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1095  
1091 1096          /* record last execution time */
1092 1097          cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1098  
1094 1099          /*
1095 1100           * If curthread was previously in the TS_ONPROC state, setfrontdq and
1096 1101           * setbackdq won't have set its t_waitrq.  Since we now finally know
1097 1102           * that we're switching away from this thread, set its t_waitrq if it
1098 1103           * is on a run queue.
1099 1104           */
1100 1105          if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1106                  curthread->t_waitrq = now;
1102 1107          }
1103 1108  
1104 1109          /* restore next thread to previously running microstate */
1105 1110          restore_mstate(next);
1106 1111  
1107 1112          if (dtrace_vtime_active)

↓ open down ↓

1039 lines elided

↑ open up ↑

1108 1113                  dtrace_vtime_switch(next);
1109 1114  
1110 1115          resume(next);
1111 1116          /*
1112 1117           * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1118           * appear at the end of resume(), because we may not
1114 1119           * return here
1115 1120           */
1116 1121  }
1117 1122  
1118      -#define CPU_IDLING(pri) ((pri) == -1)
1119      -
1120 1123  static void
1121 1124  cpu_resched(cpu_t *cp, pri_t tpri)
1122 1125  {
1123 1126          int     call_poke_cpu = 0;
1124 1127          pri_t   cpupri = cp->cpu_dispatch_pri;
1125 1128  
1126      -        if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
     1129 +        if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
1127 1130                  TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1131                      "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1132                  if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1133                          cp->cpu_runrun = 1;
1131 1134                          aston(cp->cpu_dispthread);
1132 1135                          if (tpri < kpreemptpri && cp != CPU)
1133 1136                                  call_poke_cpu = 1;
1134 1137                  }
1135 1138                  if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1139                          cp->cpu_kprunrun = 1;

1137 1140                          if (cp != CPU)
1138 1141                                  call_poke_cpu = 1;
1139 1142                  }
1140 1143          }
1141 1144  
1142 1145          /*
1143 1146           * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1147           */
1145 1148          membar_enter();
1146 1149  
1147 1150          if (call_poke_cpu)
1148 1151                  poke_cpu(cp->cpu_id);
1149 1152  }
1150 1153  
1151 1154  /*
1152 1155   * setbackdq() keeps runqs balanced such that the difference in length
1153 1156   * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1157   * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1158   * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1159   * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1160   */
1158 1161  #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1159 1162  #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1160 1163  #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 1164  
1162 1165  /*
1163 1166   * Macro that evaluates to true if it is likely that the thread has cache
1164 1167   * warmth. This is based on the amount of time that has elapsed since the
1165 1168   * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1169   * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1170   * some affinity for t->t_cpu.
1168 1171   */
1169 1172  #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1173          ((thread == curthread) ||       \
1171 1174          ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 1175  /*
1173 1176   * Put the specified thread on the back of the dispatcher
1174 1177   * queue corresponding to its current priority.
1175 1178   *
1176 1179   * Called with the thread in transition, onproc or stopped state
1177 1180   * and locked (transition implies locked) and at high spl.
1178 1181   * Returns with the thread in TS_RUN state and still locked.
1179 1182   */
1180 1183  void
1181 1184  setbackdq(kthread_t *tp)
1182 1185  {
1183 1186          dispq_t *dq;
1184 1187          disp_t          *dp;
1185 1188          cpu_t           *cp;
1186 1189          pri_t           tpri;
1187 1190          int             bound;
1188 1191          boolean_t       self;
1189 1192  
1190 1193          ASSERT(THREAD_LOCK_HELD(tp));
1191 1194          ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1195          ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1193 1196  
1194 1197          /*
1195 1198           * If thread is "swapped" or on the swap queue don't
1196 1199           * queue it, but wake sched.
1197 1200           */
1198 1201          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 1202                  disp_swapped_setrun(tp);
1200 1203                  return;
1201 1204          }
1202 1205  
1203 1206          self = (tp == curthread);
1204 1207  
1205 1208          if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 1209                  bound = 1;
1207 1210          else
1208 1211                  bound = 0;
1209 1212  
1210 1213          tpri = DISP_PRIO(tp);
1211 1214          if (ncpus == 1)

↓ open down ↓

75 lines elided

↑ open up ↑

1212 1215                  cp = tp->t_cpu;
1213 1216          else if (!bound) {
1214 1217                  if (tpri >= kpqpri) {
1215 1218                          setkpdq(tp, SETKP_BACK);
1216 1219                          return;
1217 1220                  }
1218 1221  
1219 1222                  /*
1220 1223                   * We'll generally let this thread continue to run where
1221 1224                   * it last ran...but will consider migration if:
1222      -                 * - We thread probably doesn't have much cache warmth.
     1225 +                 * - The thread probably doesn't have much cache warmth.
     1226 +                 * - HT exclusion would prefer us to run elsewhere
1223 1227                   * - The CPU where it last ran is the target of an offline
1224 1228                   *   request.
1225      -                 * - The thread last ran outside it's home lgroup.
     1229 +                 * - The thread last ran outside its home lgroup.
1226 1230                   */
1227 1231                  if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228      -                    (tp->t_cpu == cpu_inmotion)) {
1229      -                        cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230      -                } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231      -                        cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232      -                            self ? tp->t_cpu : NULL);
     1232 +                    !ht_should_run(tp, tp->t_cpu) ||
     1233 +                    (tp->t_cpu == cpu_inmotion) ||
     1234 +                    !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
     1235 +                        cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1233 1236                  } else {
1234 1237                          cp = tp->t_cpu;
1235 1238                  }
1236 1239  
1237 1240                  if (tp->t_cpupart == cp->cpu_part) {
1238 1241                          int     qlen;
1239 1242  
1240 1243                          /*
1241 1244                           * Perform any CMT load balancing
1242 1245                           */

1243 1246                          cp = cmt_balance(tp, cp);
1244 1247  
1245 1248                          /*
1246 1249                           * Balance across the run queues
1247 1250                           */
1248 1251                          qlen = RUNQ_LEN(cp, tpri);
1249 1252                          if (tpri >= RUNQ_MATCH_PRI &&
1250 1253                              !(tp->t_schedflag & TS_RUNQMATCH))

↓ open down ↓

8 lines elided

↑ open up ↑

1251 1254                                  qlen -= RUNQ_MAX_DIFF;
1252 1255                          if (qlen > 0) {
1253 1256                                  cpu_t *newcp;
1254 1257  
1255 1258                                  if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1259                                          newcp = cp->cpu_next_part;
1257 1260                                  } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1261                                          newcp = cp->cpu_next_part;
1259 1262                                  }
1260 1263  
1261      -                                if (RUNQ_LEN(newcp, tpri) < qlen) {
     1264 +                                if (ht_should_run(tp, newcp) &&
     1265 +                                    RUNQ_LEN(newcp, tpri) < qlen) {
1262 1266                                          DTRACE_PROBE3(runq__balance,
1263 1267                                              kthread_t *, tp,
1264 1268                                              cpu_t *, cp, cpu_t *, newcp);
1265 1269                                          cp = newcp;
1266 1270                                  }
1267 1271                          }
1268 1272                  } else {
1269 1273                          /*
1270 1274                           * Migrate to a cpu in the new partition.
1271 1275                           */
1272      -                        cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273      -                            tp->t_lpl, tp->t_pri, NULL);
     1276 +                        cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
     1277 +                            tp->t_pri);
1274 1278                  }
1275 1279                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1280          } else {
1277 1281                  /*
1278 1282                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1283                   * a short time until weak binding that existed when the
1280 1284                   * strong binding was established has dropped) so we must
1281 1285                   * favour weak binding over strong.
1282 1286                   */
1283 1287                  cp = tp->t_weakbound_cpu ?

1284 1288                      tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1289          }
1286 1290          /*
1287 1291           * A thread that is ONPROC may be temporarily placed on the run queue
1288 1292           * but then chosen to run again by disp.  If the thread we're placing on
1289 1293           * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1294           * replacement process is actually scheduled in swtch().  In this
1291 1295           * situation, curthread is the only thread that could be in the ONPROC
1292 1296           * state.
1293 1297           */
1294 1298          if ((!self) && (tp->t_waitrq == 0)) {
1295 1299                  hrtime_t curtime;
1296 1300  
1297 1301                  curtime = gethrtime_unscaled();
1298 1302                  (void) cpu_update_pct(tp, curtime);
1299 1303                  tp->t_waitrq = curtime;
1300 1304          } else {
1301 1305                  (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1306          }
1303 1307  
1304 1308          dp = cp->cpu_disp;
1305 1309          disp_lock_enter_high(&dp->disp_lock);
1306 1310  
1307 1311          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 1312          TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 1313              tpri, cp, tp);
1310 1314  
1311 1315  #ifndef NPROBE
1312 1316          /* Kernel probe */
1313 1317          if (tnf_tracing_active)
1314 1318                  tnf_thread_queue(tp, cp, tpri);
1315 1319  #endif /* NPROBE */
1316 1320  
1317 1321          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1322  
1319 1323          THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1320 1324          tp->t_disp_queue = dp;
1321 1325          tp->t_link = NULL;
1322 1326  
1323 1327          dq = &dp->disp_q[tpri];
1324 1328          dp->disp_nrunnable++;
1325 1329          if (!bound)
1326 1330                  dp->disp_steal = 0;
1327 1331          membar_enter();
1328 1332  
1329 1333          if (dq->dq_sruncnt++ != 0) {
1330 1334                  ASSERT(dq->dq_first != NULL);
1331 1335                  dq->dq_last->t_link = tp;
1332 1336                  dq->dq_last = tp;
1333 1337          } else {
1334 1338                  ASSERT(dq->dq_first == NULL);
1335 1339                  ASSERT(dq->dq_last == NULL);
1336 1340                  dq->dq_first = dq->dq_last = tp;
1337 1341                  BT_SET(dp->disp_qactmap, tpri);
1338 1342                  if (tpri > dp->disp_maxrunpri) {
1339 1343                          dp->disp_maxrunpri = tpri;
1340 1344                          membar_enter();
1341 1345                          cpu_resched(cp, tpri);
1342 1346                  }
1343 1347          }
1344 1348  
1345 1349          if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1350                  if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1351                          /*
1348 1352                           * If there are no other unbound threads on the
1349 1353                           * run queue, don't allow other CPUs to steal
1350 1354                           * this thread while we are in the middle of a
1351 1355                           * context switch. We may just switch to it
1352 1356                           * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1357                           * in swtch and swtch_to.
1354 1358                           */
1355 1359                          cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1360                  }
1357 1361                  dp->disp_max_unbound_pri = tpri;
1358 1362          }
1359 1363          (*disp_enq_thread)(cp, bound);
1360 1364  }
1361 1365  
1362 1366  /*
1363 1367   * Put the specified thread on the front of the dispatcher
1364 1368   * queue corresponding to its current priority.
1365 1369   *
1366 1370   * Called with the thread in transition, onproc or stopped state
1367 1371   * and locked (transition implies locked) and at high spl.
1368 1372   * Returns with the thread in TS_RUN state and still locked.
1369 1373   */
1370 1374  void
1371 1375  setfrontdq(kthread_t *tp)
1372 1376  {
1373 1377          disp_t          *dp;
1374 1378          dispq_t         *dq;
1375 1379          cpu_t           *cp;
1376 1380          pri_t           tpri;
1377 1381          int             bound;
1378 1382  
1379 1383          ASSERT(THREAD_LOCK_HELD(tp));
1380 1384          ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 1385          ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1382 1386  
1383 1387          /*
1384 1388           * If thread is "swapped" or on the swap queue don't
1385 1389           * queue it, but wake sched.
1386 1390           */
1387 1391          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 1392                  disp_swapped_setrun(tp);
1389 1393                  return;
1390 1394          }
1391 1395  
1392 1396          if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 1397                  bound = 1;
1394 1398          else
1395 1399                  bound = 0;
1396 1400  
1397 1401          tpri = DISP_PRIO(tp);
1398 1402          if (ncpus == 1)
1399 1403                  cp = tp->t_cpu;

↓ open down ↓

116 lines elided

↑ open up ↑

1400 1404          else if (!bound) {
1401 1405                  if (tpri >= kpqpri) {
1402 1406                          setkpdq(tp, SETKP_FRONT);
1403 1407                          return;
1404 1408                  }
1405 1409                  cp = tp->t_cpu;
1406 1410                  if (tp->t_cpupart == cp->cpu_part) {
1407 1411                          /*
1408 1412                           * We'll generally let this thread continue to run
1409 1413                           * where it last ran, but will consider migration if:
1410      -                         * - The thread last ran outside it's home lgroup.
     1414 +                         * - The thread last ran outside its home lgroup.
1411 1415                           * - The CPU where it last ran is the target of an
1412 1416                           *   offline request (a thread_nomigrate() on the in
1413 1417                           *   motion CPU relies on this when forcing a preempt).
1414 1418                           * - The thread isn't the highest priority thread where
1415 1419                           *   it last ran, and it is considered not likely to
1416 1420                           *   have significant cache warmth.
1417 1421                           */
1418      -                        if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419      -                            (cp == cpu_inmotion)) {
1420      -                                cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421      -                                    (tp == curthread) ? cp : NULL);
1422      -                        } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423      -                            (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424      -                                cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425      -                                    NULL);
     1422 +                        if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
     1423 +                            cp == cpu_inmotion ||
     1424 +                            (tpri < cp->cpu_disp->disp_maxrunpri &&
     1425 +                            !THREAD_HAS_CACHE_WARMTH(tp))) {
     1426 +                                cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
1426 1427                          }
1427 1428                  } else {
1428 1429                          /*
1429 1430                           * Migrate to a cpu in the new partition.
1430 1431                           */
1431 1432                          cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432      -                            tp->t_lpl, tp->t_pri, NULL);
     1433 +                            tp, tp->t_pri);
1433 1434                  }
1434 1435                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1436          } else {
1436 1437                  /*
1437 1438                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1439                   * a short time until weak binding that existed when the
1439 1440                   * strong binding was established has dropped) so we must
1440 1441                   * favour weak binding over strong.
1441 1442                   */
1442 1443                  cp = tp->t_weakbound_cpu ?

1443 1444                      tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 1445          }
1445 1446  
1446 1447          /*
1447 1448           * A thread that is ONPROC may be temporarily placed on the run queue
1448 1449           * but then chosen to run again by disp.  If the thread we're placing on
1449 1450           * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 1451           * replacement process is actually scheduled in swtch().  In this
1451 1452           * situation, curthread is the only thread that could be in the ONPROC
1452 1453           * state.
1453 1454           */
1454 1455          if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 1456                  hrtime_t curtime;
1456 1457  
1457 1458                  curtime = gethrtime_unscaled();
1458 1459                  (void) cpu_update_pct(tp, curtime);
1459 1460                  tp->t_waitrq = curtime;
1460 1461          } else {
1461 1462                  (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 1463          }
1463 1464  
1464 1465          dp = cp->cpu_disp;
1465 1466          disp_lock_enter_high(&dp->disp_lock);
1466 1467  
1467 1468          TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 1469          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 1470  
1470 1471  #ifndef NPROBE
1471 1472          /* Kernel probe */
1472 1473          if (tnf_tracing_active)
1473 1474                  tnf_thread_queue(tp, cp, tpri);
1474 1475  #endif /* NPROBE */
1475 1476  
1476 1477          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 1478  
1478 1479          THREAD_RUN(tp, &dp->disp_lock);         /* set TS_RUN state and lock */
1479 1480          tp->t_disp_queue = dp;
1480 1481  
1481 1482          dq = &dp->disp_q[tpri];
1482 1483          dp->disp_nrunnable++;
1483 1484          if (!bound)
1484 1485                  dp->disp_steal = 0;
1485 1486          membar_enter();
1486 1487  
1487 1488          if (dq->dq_sruncnt++ != 0) {
1488 1489                  ASSERT(dq->dq_last != NULL);
1489 1490                  tp->t_link = dq->dq_first;
1490 1491                  dq->dq_first = tp;
1491 1492          } else {
1492 1493                  ASSERT(dq->dq_last == NULL);
1493 1494                  ASSERT(dq->dq_first == NULL);
1494 1495                  tp->t_link = NULL;
1495 1496                  dq->dq_first = dq->dq_last = tp;
1496 1497                  BT_SET(dp->disp_qactmap, tpri);
1497 1498                  if (tpri > dp->disp_maxrunpri) {
1498 1499                          dp->disp_maxrunpri = tpri;
1499 1500                          membar_enter();
1500 1501                          cpu_resched(cp, tpri);
1501 1502                  }
1502 1503          }
1503 1504  
1504 1505          if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 1506                  if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 1507                      cp == CPU) {
1507 1508                          /*
1508 1509                           * If there are no other unbound threads on the
1509 1510                           * run queue, don't allow other CPUs to steal
1510 1511                           * this thread while we are in the middle of a
1511 1512                           * context switch. We may just switch to it
1512 1513                           * again right away. CPU_DISP_DONTSTEAL is cleared
1513 1514                           * in swtch and swtch_to.
1514 1515                           */
1515 1516                          cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 1517                  }
1517 1518                  dp->disp_max_unbound_pri = tpri;
1518 1519          }
1519 1520          (*disp_enq_thread)(cp, bound);
1520 1521  }
1521 1522  
1522 1523  /*
1523 1524   * Put a high-priority unbound thread on the kp queue
1524 1525   */
1525 1526  static void
1526 1527  setkpdq(kthread_t *tp, int borf)
1527 1528  {
1528 1529          dispq_t *dq;
1529 1530          disp_t  *dp;
1530 1531          cpu_t   *cp;
1531 1532          pri_t   tpri;
1532 1533  
1533 1534          tpri = DISP_PRIO(tp);
1534 1535  
1535 1536          dp = &tp->t_cpupart->cp_kp_queue;
1536 1537          disp_lock_enter_high(&dp->disp_lock);
1537 1538  
1538 1539          TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1540  
1540 1541          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1542          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1543          THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1543 1544          tp->t_disp_queue = dp;
1544 1545          dp->disp_nrunnable++;
1545 1546          dq = &dp->disp_q[tpri];
1546 1547  
1547 1548          if (dq->dq_sruncnt++ != 0) {
1548 1549                  if (borf == SETKP_BACK) {
1549 1550                          ASSERT(dq->dq_first != NULL);
1550 1551                          tp->t_link = NULL;
1551 1552                          dq->dq_last->t_link = tp;
1552 1553                          dq->dq_last = tp;
1553 1554                  } else {
1554 1555                          ASSERT(dq->dq_last != NULL);
1555 1556                          tp->t_link = dq->dq_first;
1556 1557                          dq->dq_first = tp;
1557 1558                  }
1558 1559          } else {
1559 1560                  if (borf == SETKP_BACK) {
1560 1561                          ASSERT(dq->dq_first == NULL);
1561 1562                          ASSERT(dq->dq_last == NULL);
1562 1563                          dq->dq_first = dq->dq_last = tp;
1563 1564                  } else {
1564 1565                          ASSERT(dq->dq_last == NULL);
1565 1566                          ASSERT(dq->dq_first == NULL);
1566 1567                          tp->t_link = NULL;
1567 1568                          dq->dq_first = dq->dq_last = tp;
1568 1569                  }
1569 1570                  BT_SET(dp->disp_qactmap, tpri);
1570 1571                  if (tpri > dp->disp_max_unbound_pri)
1571 1572                          dp->disp_max_unbound_pri = tpri;
1572 1573                  if (tpri > dp->disp_maxrunpri) {

↓ open down ↓

130 lines elided

↑ open up ↑

1573 1574                          dp->disp_maxrunpri = tpri;
1574 1575                          membar_enter();
1575 1576                  }
1576 1577          }
1577 1578  
1578 1579          cp = tp->t_cpu;
1579 1580          if (tp->t_cpupart != cp->cpu_part) {
1580 1581                  /* migrate to a cpu in the new partition */
1581 1582                  cp = tp->t_cpupart->cp_cpulist;
1582 1583          }
1583      -        cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
     1584 +        cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
1584 1585          disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1586          ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1587  
1587 1588  #ifndef NPROBE
1588 1589          /* Kernel probe */
1589 1590          if (tnf_tracing_active)
1590 1591                  tnf_thread_queue(tp, cp, tpri);
1591 1592  #endif /* NPROBE */
1592 1593  
1593 1594          if (cp->cpu_chosen_level < tpri)

1594 1595                  cp->cpu_chosen_level = tpri;
1595 1596          cpu_resched(cp, tpri);
1596 1597          disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1598          (*disp_enq_thread)(cp, 0);
1598 1599  }
1599 1600  
1600 1601  /*
1601 1602   * Remove a thread from the dispatcher queue if it is on it.
1602 1603   * It is not an error if it is not found but we return whether
1603 1604   * or not it was found in case the caller wants to check.
1604 1605   */
1605 1606  int
1606 1607  dispdeq(kthread_t *tp)
1607 1608  {
1608 1609          disp_t          *dp;
1609 1610          dispq_t         *dq;
1610 1611          kthread_t       *rp;
1611 1612          kthread_t       *trp;
1612 1613          kthread_t       **ptp;
1613 1614          int             tpri;
1614 1615  
1615 1616          ASSERT(THREAD_LOCK_HELD(tp));
1616 1617  
1617 1618          if (tp->t_state != TS_RUN)
1618 1619                  return (0);
1619 1620  
1620 1621          /*
1621 1622           * The thread is "swapped" or is on the swap queue and
1622 1623           * hence no longer on the run queue, so return true.
1623 1624           */
1624 1625          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 1626                  return (1);
1626 1627  
1627 1628          tpri = DISP_PRIO(tp);
1628 1629          dp = tp->t_disp_queue;
1629 1630          ASSERT(tpri < dp->disp_npri);
1630 1631          dq = &dp->disp_q[tpri];
1631 1632          ptp = &dq->dq_first;
1632 1633          rp = *ptp;
1633 1634          trp = NULL;
1634 1635  
1635 1636          ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1637  
1637 1638          /*
1638 1639           * Search for thread in queue.
1639 1640           * Double links would simplify this at the expense of disp/setrun.
1640 1641           */
1641 1642          while (rp != tp && rp != NULL) {
1642 1643                  trp = rp;
1643 1644                  ptp = &trp->t_link;
1644 1645                  rp = trp->t_link;
1645 1646          }
1646 1647  
1647 1648          if (rp == NULL) {
1648 1649                  panic("dispdeq: thread not on queue");
1649 1650          }
1650 1651  
1651 1652          DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1653  
1653 1654          /*
1654 1655           * Found it so remove it from queue.
1655 1656           */
1656 1657          if ((*ptp = rp->t_link) == NULL)
1657 1658                  dq->dq_last = trp;
1658 1659  
1659 1660          dp->disp_nrunnable--;
1660 1661          if (--dq->dq_sruncnt == 0) {
1661 1662                  dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1663                  if (dp->disp_nrunnable == 0) {
1663 1664                          dp->disp_max_unbound_pri = -1;
1664 1665                          dp->disp_maxrunpri = -1;
1665 1666                  } else if (tpri == dp->disp_maxrunpri) {
1666 1667                          int ipri;
1667 1668  
1668 1669                          ipri = bt_gethighbit(dp->disp_qactmap,
1669 1670                              dp->disp_maxrunpri >> BT_ULSHIFT);
1670 1671                          if (ipri < dp->disp_max_unbound_pri)
1671 1672                                  dp->disp_max_unbound_pri = ipri;
1672 1673                          dp->disp_maxrunpri = ipri;
1673 1674                  }
1674 1675          }
1675 1676          tp->t_link = NULL;
1676 1677          THREAD_TRANSITION(tp);          /* put in intermediate state */
1677 1678          return (1);
1678 1679  }
1679 1680  
1680 1681  
1681 1682  /*
1682 1683   * dq_sruninc and dq_srundec are public functions for
1683 1684   * incrementing/decrementing the sruncnts when a thread on
1684 1685   * a dispatcher queue is made schedulable/unschedulable by
1685 1686   * resetting the TS_LOAD flag.
1686 1687   *
1687 1688   * The caller MUST have the thread lock and therefore the dispatcher
1688 1689   * queue lock so that the operation which changes
1689 1690   * the flag, the operation that checks the status of the thread to
1690 1691   * determine if it's on a disp queue AND the call to this function
1691 1692   * are one atomic operation with respect to interrupts.
1692 1693   */
1693 1694  
1694 1695  /*
1695 1696   * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 1697   */
1697 1698  void
1698 1699  dq_sruninc(kthread_t *t)
1699 1700  {
1700 1701          ASSERT(t->t_state == TS_RUN);
1701 1702          ASSERT(t->t_schedflag & TS_LOAD);
1702 1703  
1703 1704          THREAD_TRANSITION(t);
1704 1705          setfrontdq(t);
1705 1706  }
1706 1707  
1707 1708  /*
1708 1709   * See comment on calling conventions above.
1709 1710   * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 1711   */
1711 1712  void
1712 1713  dq_srundec(kthread_t *t)
1713 1714  {
1714 1715          ASSERT(t->t_schedflag & TS_LOAD);
1715 1716  
1716 1717          (void) dispdeq(t);
1717 1718          disp_swapped_enq(t);
1718 1719  }
1719 1720  
1720 1721  /*
1721 1722   * Change the dispatcher lock of thread to the "swapped_lock"
1722 1723   * and return with thread lock still held.
1723 1724   *
1724 1725   * Called with thread_lock held, in transition state, and at high spl.
1725 1726   */
1726 1727  void
1727 1728  disp_swapped_enq(kthread_t *tp)
1728 1729  {
1729 1730          ASSERT(THREAD_LOCK_HELD(tp));
1730 1731          ASSERT(tp->t_schedflag & TS_LOAD);
1731 1732  
1732 1733          switch (tp->t_state) {
1733 1734          case TS_RUN:
1734 1735                  disp_lock_enter_high(&swapped_lock);
1735 1736                  THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 1737                  break;
1737 1738          case TS_ONPROC:
1738 1739                  disp_lock_enter_high(&swapped_lock);
1739 1740                  THREAD_TRANSITION(tp);
1740 1741                  wake_sched_sec = 1;             /* tell clock to wake sched */
1741 1742                  THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 1743                  break;
1743 1744          default:
1744 1745                  panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 1746          }
1746 1747  }
1747 1748  
1748 1749  /*
1749 1750   * This routine is called by setbackdq/setfrontdq if the thread is
1750 1751   * not loaded or loaded and on the swap queue.
1751 1752   *
1752 1753   * Thread state TS_SLEEP implies that a swapped thread
1753 1754   * has been woken up and needs to be swapped in by the swapper.
1754 1755   *
1755 1756   * Thread state TS_RUN implies that the priority of a swapped
1756 1757   * thread is being increased by scheduling class (e.g. ts_update).
1757 1758   */
1758 1759  static void
1759 1760  disp_swapped_setrun(kthread_t *tp)
1760 1761  {
1761 1762          ASSERT(THREAD_LOCK_HELD(tp));
1762 1763          ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 1764  
1764 1765          switch (tp->t_state) {
1765 1766          case TS_SLEEP:
1766 1767                  disp_lock_enter_high(&swapped_lock);
1767 1768                  /*
1768 1769                   * Wakeup sched immediately (i.e., next tick) if the
1769 1770                   * thread priority is above maxclsyspri.
1770 1771                   */
1771 1772                  if (DISP_PRIO(tp) > maxclsyspri)
1772 1773                          wake_sched = 1;
1773 1774                  else
1774 1775                          wake_sched_sec = 1;
1775 1776                  THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 1777                  break;
1777 1778          case TS_RUN:                            /* called from ts_update */
1778 1779                  break;
1779 1780          default:
1780 1781                  panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 1782          }
1782 1783  }
1783 1784  
1784 1785  /*
1785 1786   *      Make a thread give up its processor.  Find the processor on
1786 1787   *      which this thread is executing, and have that processor
1787 1788   *      preempt.
1788 1789   *
1789 1790   *      We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1791   *      they are running at kernel priorities.  To implement this, we always
1791 1792   *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1792 1793   *      calls cpu_surrender() very often, we only preempt if there is anyone
1793 1794   *      competing with us.
1794 1795   */
1795 1796  void
1796 1797  cpu_surrender(kthread_t *tp)
1797 1798  {
1798 1799          cpu_t   *cpup;
1799 1800          int     max_pri;
1800 1801          int     max_run_pri;
1801 1802          klwp_t  *lwp;
1802 1803  
1803 1804          ASSERT(THREAD_LOCK_HELD(tp));
1804 1805  
1805 1806          if (tp->t_state != TS_ONPROC)
1806 1807                  return;
1807 1808          cpup = tp->t_disp_queue->disp_cpu;      /* CPU thread dispatched to */
1808 1809          max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1810          max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1811          if (max_pri < max_run_pri)
1811 1812                  max_pri = max_run_pri;
1812 1813  
1813 1814          if (tp->t_cid == sysdccid) {
1814 1815                  uint_t t_pri = DISP_PRIO(tp);
1815 1816                  if (t_pri > max_pri)
1816 1817                          return;         /* we are not competing w/ anyone */
1817 1818                  cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1819          } else {
1819 1820                  cpup->cpu_runrun = 1;
1820 1821                  if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1822                          cpup->cpu_kprunrun = 1;
1822 1823                  }
1823 1824          }
1824 1825  
1825 1826          /*
1826 1827           * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1827 1828           */
1828 1829          membar_enter();
1829 1830  
1830 1831          DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1832  
1832 1833          /*
1833 1834           * Make the target thread take an excursion through trap()
1834 1835           * to do preempt() (unless we're already in trap or post_syscall,
1835 1836           * calling cpu_surrender via CL_TRAPRET).
1836 1837           */
1837 1838          if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1839              lwp->lwp_state != LWP_USER) {
1839 1840                  aston(tp);
1840 1841                  if (cpup != CPU)
1841 1842                          poke_cpu(cpup->cpu_id);
1842 1843          }
1843 1844          TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1845              "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1846  }
1846 1847  
1847 1848  /*
1848 1849   * Commit to and ratify a scheduling decision
1849 1850   */
1850 1851  /*ARGSUSED*/
1851 1852  static kthread_t *
1852 1853  disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1854  {
1854 1855          pri_t   tpri, maxpri;
1855 1856          pri_t   maxkpri;
1856 1857          cpu_t   *cpup;
1857 1858  
1858 1859          ASSERT(tp != NULL);
1859 1860          /*
1860 1861           * Commit to, then ratify scheduling decision
1861 1862           */
1862 1863          cpup = CPU;
1863 1864          if (cpup->cpu_runrun != 0)
1864 1865                  cpup->cpu_runrun = 0;
1865 1866          if (cpup->cpu_kprunrun != 0)
1866 1867                  cpup->cpu_kprunrun = 0;
1867 1868          if (cpup->cpu_chosen_level != -1)
1868 1869                  cpup->cpu_chosen_level = -1;
1869 1870          membar_enter();
1870 1871          tpri = DISP_PRIO(tp);
1871 1872          maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1873          maxkpri = kpq->disp_maxrunpri;
1873 1874          if (maxpri < maxkpri)
1874 1875                  maxpri = maxkpri;
1875 1876          if (tpri < maxpri) {
1876 1877                  /*
1877 1878                   * should have done better
1878 1879                   * put this one back and indicate to try again
1879 1880                   */
1880 1881                  cpup->cpu_dispthread = curthread;       /* fixup dispthread */
1881 1882                  cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1883                  thread_lock_high(tp);
1883 1884                  THREAD_TRANSITION(tp);
1884 1885                  setfrontdq(tp);
1885 1886                  thread_unlock_nopreempt(tp);
1886 1887  
1887 1888                  tp = NULL;
1888 1889          }
1889 1890          return (tp);
1890 1891  }
1891 1892  
1892 1893  /*
1893 1894   * See if there is any work on the dispatcher queue for other CPUs.
1894 1895   * If there is, dequeue the best thread and return.
1895 1896   */
1896 1897  static kthread_t *
1897 1898  disp_getwork(cpu_t *cp)
1898 1899  {
1899 1900          cpu_t           *ocp;           /* other CPU */
1900 1901          cpu_t           *ocp_start;
1901 1902          cpu_t           *tcp;           /* target local CPU */
1902 1903          kthread_t       *tp;
1903 1904          kthread_t       *retval = NULL;
1904 1905          pri_t           maxpri;
1905 1906          disp_t          *kpq;           /* kp queue for this partition */
1906 1907          lpl_t           *lpl, *lpl_leaf;
1907 1908          int             leafidx, startidx;
1908 1909          hrtime_t        stealtime;
1909 1910          lgrp_id_t       local_id;
1910 1911  
1911 1912          maxpri = -1;
1912 1913          tcp = NULL;
1913 1914  
1914 1915          kpq = &cp->cpu_part->cp_kp_queue;
1915 1916          while (kpq->disp_maxrunpri >= 0) {
1916 1917                  /*
1917 1918                   * Try to take a thread from the kp_queue.
1918 1919                   */
1919 1920                  tp = (disp_getbest(kpq));
1920 1921                  if (tp)
1921 1922                          return (disp_ratify(tp, kpq));
1922 1923          }
1923 1924  
1924 1925          kpreempt_disable();             /* protect the cpu_active list */
1925 1926  
1926 1927          /*
1927 1928           * Try to find something to do on another CPU's run queue.
1928 1929           * Loop through all other CPUs looking for the one with the highest
1929 1930           * priority unbound thread.
1930 1931           *
1931 1932           * On NUMA machines, the partition's CPUs are consulted in order of
1932 1933           * distance from the current CPU. This way, the first available
1933 1934           * work found is also the closest, and will suffer the least
1934 1935           * from being migrated.
1935 1936           */
1936 1937          lpl = lpl_leaf = cp->cpu_lpl;
1937 1938          local_id = lpl_leaf->lpl_lgrpid;
1938 1939          leafidx = startidx = 0;
1939 1940  
1940 1941          /*
1941 1942           * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1943           * broader levels of locality
1943 1944           */
1944 1945          do {
1945 1946                  /* This loop iterates over the lpl's leaves */
1946 1947                  do {
1947 1948                          if (lpl_leaf != cp->cpu_lpl)
1948 1949                                  ocp = lpl_leaf->lpl_cpus;
1949 1950                          else
1950 1951                                  ocp = cp->cpu_next_lpl;
1951 1952  
1952 1953                          /* This loop iterates over the CPUs in the leaf */
1953 1954                          ocp_start = ocp;
1954 1955                          do {
1955 1956                                  pri_t pri;
1956 1957  
1957 1958                                  ASSERT(CPU_ACTIVE(ocp));
1958 1959  
1959 1960                                  /*
1960 1961                                   * End our stroll around this lpl if:
1961 1962                                   *
1962 1963                                   * - Something became runnable on the local
1963 1964                                   *   queue...which also ends our stroll around
1964 1965                                   *   the partition.
1965 1966                                   *
1966 1967                                   * - We happen across another idle CPU.
1967 1968                                   *   Since it is patrolling the next portion
1968 1969                                   *   of the lpl's list (assuming it's not
1969 1970                                   *   halted, or busy servicing an interrupt),
1970 1971                                   *   move to the next higher level of locality.
1971 1972                                   */
1972 1973                                  if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1974                                          kpreempt_enable();
1974 1975                                          return (NULL);
1975 1976                                  }
1976 1977                                  if (ocp->cpu_dispatch_pri == -1) {
1977 1978                                          if (ocp->cpu_disp_flags &
1978 1979                                              CPU_DISP_HALTED ||
1979 1980                                              ocp->cpu_intr_actv != 0)
1980 1981                                                  continue;
1981 1982                                          else
1982 1983                                                  goto next_level;
1983 1984                                  }
1984 1985  
1985 1986                                  /*
1986 1987                                   * If there's only one thread and the CPU
1987 1988                                   * is in the middle of a context switch,
1988 1989                                   * or it's currently running the idle thread,
1989 1990                                   * don't steal it.
1990 1991                                   */
1991 1992                                  if ((ocp->cpu_disp_flags &
1992 1993                                      CPU_DISP_DONTSTEAL) &&
1993 1994                                      ocp->cpu_disp->disp_nrunnable == 1)
1994 1995                                          continue;
1995 1996  
1996 1997                                  pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1998                                  if (pri > maxpri) {
1998 1999                                          /*
1999 2000                                           * Don't steal threads that we attempted
2000 2001                                           * to steal recently until they're ready
2001 2002                                           * to be stolen again.
2002 2003                                           */
2003 2004                                          stealtime = ocp->cpu_disp->disp_steal;
2004 2005                                          if (stealtime == 0 ||
2005 2006                                              stealtime - gethrtime() <= 0) {
2006 2007                                                  maxpri = pri;
2007 2008                                                  tcp = ocp;
2008 2009                                          } else {
2009 2010                                                  /*
2010 2011                                                   * Don't update tcp, just set
2011 2012                                                   * the retval to T_DONTSTEAL, so
2012 2013                                                   * that if no acceptable CPUs
2013 2014                                                   * are found the return value
2014 2015                                                   * will be T_DONTSTEAL rather
2015 2016                                                   * than NULL.
2016 2017                                                   */
2017 2018                                                  retval = T_DONTSTEAL;
2018 2019                                          }
2019 2020                                  }
2020 2021                          } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 2022  
2022 2023                          /*
2023 2024                           * Iterate to the next leaf lpl in the resource set
2024 2025                           * at this level of locality. If we hit the end of
2025 2026                           * the set, wrap back around to the beginning.
2026 2027                           *
2027 2028                           * Note: This iteration is NULL terminated for a reason
2028 2029                           * see lpl_topo_bootstrap() in lgrp.c for details.
2029 2030                           */
2030 2031                          if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 2032                                  leafidx = 0;
2032 2033                                  lpl_leaf = lpl->lpl_rset[leafidx];
2033 2034                          }
2034 2035                  } while (leafidx != startidx);
2035 2036  
2036 2037  next_level:
2037 2038                  /*
2038 2039                   * Expand the search to include farther away CPUs (next
2039 2040                   * locality level). The closer CPUs that have already been
2040 2041                   * checked will be checked again. In doing so, idle CPUs
2041 2042                   * will tend to be more aggressive about stealing from CPUs
2042 2043                   * that are closer (since the closer CPUs will be considered
2043 2044                   * more often).
2044 2045                   * Begin at this level with the CPUs local leaf lpl.
2045 2046                   */
2046 2047                  if ((lpl = lpl->lpl_parent) != NULL) {
2047 2048                          leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 2049                          lpl_leaf = lpl->lpl_rset[leafidx];
2049 2050                  }
2050 2051          } while (!tcp && lpl);
2051 2052  
2052 2053          kpreempt_enable();
2053 2054  
2054 2055          /*
2055 2056           * If another queue looks good, and there is still nothing on
2056 2057           * the local queue, try to transfer one or more threads
2057 2058           * from it to our queue.
2058 2059           */
2059 2060          if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 2061                  tp = disp_getbest(tcp->cpu_disp);
2061 2062                  if (tp == NULL || tp == T_DONTSTEAL)
2062 2063                          return (tp);
2063 2064                  return (disp_ratify(tp, kpq));
2064 2065          }
2065 2066          return (retval);
2066 2067  }
2067 2068  
2068 2069  
2069 2070  /*
2070 2071   * disp_fix_unbound_pri()
2071 2072   *      Determines the maximum priority of unbound threads on the queue.
2072 2073   *      The priority is kept for the queue, but is only increased, never
2073 2074   *      reduced unless some CPU is looking for something on that queue.
2074 2075   *
2075 2076   *      The priority argument is the known upper limit.
2076 2077   *
2077 2078   *      Perhaps this should be kept accurately, but that probably means
2078 2079   *      separate bitmaps for bound and unbound threads.  Since only idled
2079 2080   *      CPUs will have to do this recalculation, it seems better this way.
2080 2081   */
2081 2082  static void
2082 2083  disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 2084  {
2084 2085          kthread_t       *tp;
2085 2086          dispq_t         *dq;
2086 2087          ulong_t         *dqactmap = dp->disp_qactmap;
2087 2088          ulong_t         mapword;
2088 2089          int             wx;
2089 2090  
2090 2091          ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 2092  
2092 2093          ASSERT(pri >= 0);                       /* checked by caller */
2093 2094  
2094 2095          /*
2095 2096           * Start the search at the next lowest priority below the supplied
2096 2097           * priority.  This depends on the bitmap implementation.
2097 2098           */
2098 2099          do {
2099 2100                  wx = pri >> BT_ULSHIFT;         /* index of word in map */
2100 2101  
2101 2102                  /*
2102 2103                   * Form mask for all lower priorities in the word.
2103 2104                   */
2104 2105                  mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 2106  
2106 2107                  /*
2107 2108                   * Get next lower active priority.
2108 2109                   */
2109 2110                  if (mapword != 0) {
2110 2111                          pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 2112                  } else if (wx > 0) {
2112 2113                          pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 2114                          if (pri < 0)
2114 2115                                  break;
2115 2116                  } else {
2116 2117                          pri = -1;
2117 2118                          break;
2118 2119                  }
2119 2120  
2120 2121                  /*
2121 2122                   * Search the queue for unbound, runnable threads.
2122 2123                   */
2123 2124                  dq = &dp->disp_q[pri];
2124 2125                  tp = dq->dq_first;
2125 2126  
2126 2127                  while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 2128                          tp = tp->t_link;
2128 2129                  }
2129 2130  
2130 2131                  /*
2131 2132                   * If a thread was found, set the priority and return.
2132 2133                   */
2133 2134          } while (tp == NULL);
2134 2135  
2135 2136          /*
2136 2137           * pri holds the maximum unbound thread priority or -1.
2137 2138           */
2138 2139          if (dp->disp_max_unbound_pri != pri)
2139 2140                  dp->disp_max_unbound_pri = pri;
2140 2141  }
2141 2142  
2142 2143  /*
2143 2144   * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 2145   *      check if the CPU to which it was previously bound should have
2145 2146   *      its disp_max_unbound_pri increased.
2146 2147   */
2147 2148  void
2148 2149  disp_adjust_unbound_pri(kthread_t *tp)
2149 2150  {
2150 2151          disp_t *dp;
2151 2152          pri_t tpri;
2152 2153  
2153 2154          ASSERT(THREAD_LOCK_HELD(tp));
2154 2155  
2155 2156          /*
2156 2157           * Don't do anything if the thread is not bound, or
2157 2158           * currently not runnable or swapped out.
2158 2159           */
2159 2160          if (tp->t_bound_cpu == NULL ||
2160 2161              tp->t_state != TS_RUN ||
2161 2162              tp->t_schedflag & TS_ON_SWAPQ)
2162 2163                  return;
2163 2164  
2164 2165          tpri = DISP_PRIO(tp);
2165 2166          dp = tp->t_bound_cpu->cpu_disp;
2166 2167          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2168          if (tpri > dp->disp_max_unbound_pri)
2168 2169                  dp->disp_max_unbound_pri = tpri;
2169 2170  }
2170 2171  
2171 2172  /*
2172 2173   * disp_getbest()
2173 2174   *   De-queue the highest priority unbound runnable thread.
2174 2175   *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2176   *   Returns NULL if nothing found.
2176 2177   *   Returns T_DONTSTEAL if the thread was not stealable,
2177 2178   *   so that the caller will try again later.
2178 2179   *
2179 2180   *   Passed a pointer to a dispatch queue not associated with this CPU, and
2180 2181   *   its type.
2181 2182   */
2182 2183  static kthread_t *
2183 2184  disp_getbest(disp_t *dp)
2184 2185  {
2185 2186          kthread_t       *tp;
2186 2187          dispq_t         *dq;
2187 2188          pri_t           pri;
2188 2189          cpu_t           *cp, *tcp;
2189 2190          boolean_t       allbound;
2190 2191  
2191 2192          disp_lock_enter(&dp->disp_lock);
2192 2193  
2193 2194          /*
2194 2195           * If there is nothing to run, or the CPU is in the middle of a
2195 2196           * context switch of the only thread, return NULL.
2196 2197           */
2197 2198          tcp = dp->disp_cpu;
2198 2199          cp = CPU;
2199 2200          pri = dp->disp_max_unbound_pri;
2200 2201          if (pri == -1 ||
2201 2202              (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2203              tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2204                  disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2205                  return (NULL);
2205 2206          }
2206 2207  
2207 2208          dq = &dp->disp_q[pri];
2208 2209  
2209 2210  
2210 2211          /*
2211 2212           * Assume that all threads are bound on this queue, and change it
2212 2213           * later when we find out that it is not the case.
2213 2214           */
2214 2215          allbound = B_TRUE;
2215 2216          for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2217                  hrtime_t now, nosteal, rqtime;
2217 2218  
2218 2219                  /*
2219 2220                   * Skip over bound threads which could be here even
2220 2221                   * though disp_max_unbound_pri indicated this level.
2221 2222                   */
2222 2223                  if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2224                          continue;
2224 2225  
2225 2226                  /*
2226 2227                   * We've got some unbound threads on this queue, so turn
2227 2228                   * the allbound flag off now.
2228 2229                   */
2229 2230                  allbound = B_FALSE;
2230 2231  
2231 2232                  /*
2232 2233                   * The thread is a candidate for stealing from its run queue. We
2233 2234                   * don't want to steal threads that became runnable just a
2234 2235                   * moment ago. This improves CPU affinity for threads that get
2235 2236                   * preempted for short periods of time and go back on the run
2236 2237                   * queue.
2237 2238                   *
2238 2239                   * We want to let it stay on its run queue if it was only placed
2239 2240                   * there recently and it was running on the same CPU before that
2240 2241                   * to preserve its cache investment. For the thread to remain on
2241 2242                   * its run queue, ALL of the following conditions must be
2242 2243                   * satisfied:
2243 2244                   *
2244 2245                   * - the disp queue should not be the kernel preemption queue
2245 2246                   * - delayed idle stealing should not be disabled
2246 2247                   * - nosteal_nsec should be non-zero
2247 2248                   * - it should run with user priority
2248 2249                   * - it should be on the run queue of the CPU where it was
2249 2250                   *   running before being placed on the run queue
2250 2251                   * - it should be the only thread on the run queue (to prevent
2251 2252                   *   extra scheduling latency for other threads)
2252 2253                   * - it should sit on the run queue for less than per-chip
2253 2254                   *   nosteal interval or global nosteal interval
2254 2255                   * - in case of CPUs with shared cache it should sit in a run
2255 2256                   *   queue of a CPU from a different chip
2256 2257                   *
2257 2258                   * The checks are arranged so that the ones that are faster are
2258 2259                   * placed earlier.
2259 2260                   */
2260 2261                  if (tcp == NULL ||
2261 2262                      pri >= minclsyspri ||
2262 2263                      tp->t_cpu != tcp)
2263 2264                          break;
2264 2265  
2265 2266                  /*
2266 2267                   * Steal immediately if, due to CMT processor architecture
2267 2268                   * migration between cp and tcp would incur no performance
2268 2269                   * penalty.
2269 2270                   */
2270 2271                  if (pg_cmt_can_migrate(cp, tcp))
2271 2272                          break;
2272 2273  
2273 2274                  nosteal = nosteal_nsec;
2274 2275                  if (nosteal == 0)
2275 2276                          break;
2276 2277  
2277 2278                  /*
2278 2279                   * Calculate time spent sitting on run queue
2279 2280                   */
2280 2281                  now = gethrtime_unscaled();
2281 2282                  rqtime = now - tp->t_waitrq;
2282 2283                  scalehrtime(&rqtime);
2283 2284  
2284 2285                  /*
2285 2286                   * Steal immediately if the time spent on this run queue is more
2286 2287                   * than allowed nosteal delay.
2287 2288                   *
2288 2289                   * Negative rqtime check is needed here to avoid infinite
2289 2290                   * stealing delays caused by unlikely but not impossible
2290 2291                   * drifts between CPU times on different CPUs.
2291 2292                   */
2292 2293                  if (rqtime > nosteal || rqtime < 0)
2293 2294                          break;
2294 2295  
2295 2296                  DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2297                      cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2298                  scalehrtime(&now);
2298 2299                  /*
2299 2300                   * Calculate when this thread becomes stealable
2300 2301                   */
2301 2302                  now += (nosteal - rqtime);
2302 2303  
2303 2304                  /*
2304 2305                   * Calculate time when some thread becomes stealable
2305 2306                   */
2306 2307                  if (now < dp->disp_steal)
2307 2308                          dp->disp_steal = now;
2308 2309          }
2309 2310  
2310 2311          /*
2311 2312           * If there were no unbound threads on this queue, find the queue
2312 2313           * where they are and then return later. The value of
2313 2314           * disp_max_unbound_pri is not always accurate because it isn't
2314 2315           * reduced until another idle CPU looks for work.
2315 2316           */
2316 2317          if (allbound)
2317 2318                  disp_fix_unbound_pri(dp, pri);
2318 2319  
2319 2320          /*
2320 2321           * If we reached the end of the queue and found no unbound threads
2321 2322           * then return NULL so that other CPUs will be considered.  If there
2322 2323           * are unbound threads but they cannot yet be stolen, then
2323 2324           * return T_DONTSTEAL and try again later.
2324 2325           */
2325 2326          if (tp == NULL) {
2326 2327                  disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2328                  return (allbound ? NULL : T_DONTSTEAL);
2328 2329          }
2329 2330  
2330 2331          /*
2331 2332           * Found a runnable, unbound thread, so remove it from queue.
2332 2333           * dispdeq() requires that we have the thread locked, and we do,
2333 2334           * by virtue of holding the dispatch queue lock.  dispdeq() will
2334 2335           * put the thread in transition state, thereby dropping the dispq
2335 2336           * lock.
2336 2337           */
2337 2338  
2338 2339  #ifdef DEBUG
2339 2340          {
2340 2341                  int     thread_was_on_queue;
2341 2342  
2342 2343                  thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2343 2344                  ASSERT(thread_was_on_queue);
2344 2345          }
2345 2346  
2346 2347  #else /* DEBUG */
2347 2348          (void) dispdeq(tp);                     /* drops disp_lock */
2348 2349  #endif /* DEBUG */
2349 2350  
2350 2351          /*
2351 2352           * Reset the disp_queue steal time - we do not know what the smallest
2352 2353           * value across the queue is.
2353 2354           */
2354 2355          dp->disp_steal = 0;
2355 2356  
2356 2357          tp->t_schedflag |= TS_DONT_SWAP;
2357 2358  
2358 2359          /*
2359 2360           * Setup thread to run on the current CPU.
2360 2361           */
2361 2362          tp->t_disp_queue = cp->cpu_disp;
2362 2363  
2363 2364          cp->cpu_dispthread = tp;                /* protected by spl only */
2364 2365          cp->cpu_dispatch_pri = pri;
2365 2366  
2366 2367          /*
2367 2368           * There can be a memory synchronization race between disp_getbest()
2368 2369           * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2370           * to preempt the current thread to run the enqueued thread while
2370 2371           * disp_getbest() and disp_ratify() are changing the current thread
2371 2372           * to the stolen thread. This may lead to a situation where
2372 2373           * cpu_resched() tries to preempt the wrong thread and the
2373 2374           * stolen thread continues to run on the CPU which has been tagged
2374 2375           * for preemption.
2375 2376           * Later the clock thread gets enqueued but doesn't get to run on the
2376 2377           * CPU causing the system to hang.
2377 2378           *
2378 2379           * To avoid this, grabbing and dropping the disp_lock (which does
2379 2380           * a memory barrier) is needed to synchronize the execution of
2380 2381           * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2382           * synchronize the memory read and written by cpu_resched(),
2382 2383           * disp_getbest(), and disp_ratify() with each other.
2383 2384           *  (see CR#6482861 for more details).
2384 2385           */
2385 2386          disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2387          disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2388  
2388 2389          ASSERT(pri == DISP_PRIO(tp));
2389 2390  
2390 2391          DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2392  
2392 2393          thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2393 2394  
2394 2395          /*
2395 2396           * Return with spl high so that swtch() won't need to raise it.
2396 2397           * The disp_lock was dropped by dispdeq().
2397 2398           */
2398 2399  
2399 2400          return (tp);
2400 2401  }
2401 2402  
2402 2403  /*
2403 2404   * disp_bound_common() - common routine for higher level functions
2404 2405   *      that check for bound threads under certain conditions.
2405 2406   *      If 'threadlistsafe' is set then there is no need to acquire
2406 2407   *      pidlock to stop the thread list from changing (eg, if
2407 2408   *      disp_bound_* is called with cpus paused).
2408 2409   */
2409 2410  static int
2410 2411  disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2412  {
2412 2413          int             found = 0;
2413 2414          kthread_t       *tp;
2414 2415  
2415 2416          ASSERT(flag);
2416 2417  
2417 2418          if (!threadlistsafe)
2418 2419                  mutex_enter(&pidlock);
2419 2420          tp = curthread;         /* faster than allthreads */
2420 2421          do {
2421 2422                  if (tp->t_state != TS_FREE) {
2422 2423                          /*
2423 2424                           * If an interrupt thread is busy, but the
2424 2425                           * caller doesn't care (i.e. BOUND_INTR is off),
2425 2426                           * then just ignore it and continue through.
2426 2427                           */
2427 2428                          if ((tp->t_flag & T_INTR_THREAD) &&
2428 2429                              !(flag & BOUND_INTR))
2429 2430                                  continue;
2430 2431  
2431 2432                          /*
2432 2433                           * Skip the idle thread for the CPU
2433 2434                           * we're about to set offline.
2434 2435                           */
2435 2436                          if (tp == cp->cpu_idle_thread)
2436 2437                                  continue;
2437 2438  
2438 2439                          /*
2439 2440                           * Skip the pause thread for the CPU
2440 2441                           * we're about to set offline.
2441 2442                           */
2442 2443                          if (tp == cp->cpu_pause_thread)
2443 2444                                  continue;
2444 2445  
2445 2446                          if ((flag & BOUND_CPU) &&
2446 2447                              (tp->t_bound_cpu == cp ||
2447 2448                              tp->t_bind_cpu == cp->cpu_id ||
2448 2449                              tp->t_weakbound_cpu == cp)) {
2449 2450                                  found = 1;
2450 2451                                  break;
2451 2452                          }
2452 2453  
2453 2454                          if ((flag & BOUND_PARTITION) &&
2454 2455                              (tp->t_cpupart == cp->cpu_part)) {
2455 2456                                  found = 1;
2456 2457                                  break;
2457 2458                          }
2458 2459                  }
2459 2460          } while ((tp = tp->t_next) != curthread && found == 0);
2460 2461          if (!threadlistsafe)
2461 2462                  mutex_exit(&pidlock);
2462 2463          return (found);
2463 2464  }
2464 2465  
2465 2466  /*
2466 2467   * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2468   *      Called infrequently.  Keep this simple.
2468 2469   *      Includes threads that are asleep or stopped but not onproc.
2469 2470   */
2470 2471  int
2471 2472  disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2473  {
2473 2474          return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2475  }
2475 2476  
2476 2477  /*
2477 2478   * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2479   * to the given processor, including interrupt threads.
2479 2480   */
2480 2481  int
2481 2482  disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2483  {
2483 2484          return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2485  }
2485 2486  
2486 2487  /*
2487 2488   * disp_bound_partition - return nonzero if threads are bound to the same
2488 2489   * partition as the processor.
2489 2490   *      Called infrequently.  Keep this simple.
2490 2491   *      Includes threads that are asleep or stopped but not onproc.
2491 2492   */
2492 2493  int
2493 2494  disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2495  {
2495 2496          return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2497  }
2497 2498  
2498 2499  /*
2499 2500   * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2501   * threads to other CPUs.
2501 2502   */
2502 2503  void
2503 2504  disp_cpu_inactive(cpu_t *cp)
2504 2505  {
2505 2506          kthread_t       *tp;
2506 2507          disp_t          *dp = cp->cpu_disp;
2507 2508          dispq_t         *dq;
2508 2509          pri_t           pri;
2509 2510          int             wasonq;
2510 2511  
2511 2512          disp_lock_enter(&dp->disp_lock);
2512 2513          while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2514                  dq = &dp->disp_q[pri];
2514 2515                  tp = dq->dq_first;
2515 2516  
2516 2517                  /*
2517 2518                   * Skip over bound threads.
2518 2519                   */
2519 2520                  while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2521                          tp = tp->t_link;
2521 2522                  }
2522 2523  
2523 2524                  if (tp == NULL) {
2524 2525                          /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2526                          disp_fix_unbound_pri(dp, pri);
2526 2527                          continue;
2527 2528                  }
2528 2529  
2529 2530                  wasonq = dispdeq(tp);           /* drops disp_lock */
2530 2531                  ASSERT(wasonq);
2531 2532                  ASSERT(tp->t_weakbound_cpu == NULL);
2532 2533  
2533 2534                  setbackdq(tp);
2534 2535                  /*
2535 2536                   * Called from cpu_offline:
2536 2537                   *
2537 2538                   * cp has already been removed from the list of active cpus
2538 2539                   * and tp->t_cpu has been changed so there is no risk of
2539 2540                   * tp ending up back on cp.
2540 2541                   *
2541 2542                   * Called from cpupart_move_cpu:
2542 2543                   *
2543 2544                   * The cpu has moved to a new cpupart.  Any threads that
2544 2545                   * were on its dispatch queues before the move remain
2545 2546                   * in the old partition and can't run in the new partition.

↓ open down ↓

952 lines elided

↑ open up ↑

2546 2547                   */
2547 2548                  ASSERT(tp->t_cpu != cp);
2548 2549                  thread_unlock(tp);
2549 2550  
2550 2551                  disp_lock_enter(&dp->disp_lock);
2551 2552          }
2552 2553          disp_lock_exit(&dp->disp_lock);
2553 2554  }
2554 2555  
2555 2556  /*
2556      - * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557      - *      The hint passed in is used as a starting point so we don't favor
2558      - *      CPU 0 or any other CPU.  The caller should pass in the most recently
2559      - *      used CPU for the thread.
     2557 + * Return a score rating this CPU for running this thread: lower is better.
2560 2558   *
2561      - *      The lgroup and priority are used to determine the best CPU to run on
2562      - *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563      - *      the thread priority will indicate whether the thread will actually run
2564      - *      there.  To pick the best CPU, the CPUs inside and outside of the given
2565      - *      lgroup which are running the lowest priority threads are found.  The
2566      - *      remote CPU is chosen only if the thread will not run locally on a CPU
2567      - *      within the lgroup, but will run on the remote CPU. If the thread
2568      - *      cannot immediately run on any CPU, the best local CPU will be chosen.
     2559 + * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
     2560 + * curcpu (as that's our own priority).
2569 2561   *
2570      - *      The lpl specified also identifies the cpu partition from which
2571      - *      disp_lowpri_cpu should select a CPU.
     2562 + * If a cpu is the target of an offline request, then try to avoid it.
2572 2563   *
2573      - *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2574      - *      behalf of the current thread. (curthread is looking for a new cpu)
2575      - *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576      - *      ignored.
     2564 + * Otherwise we'll use double the effective dispatcher priority for the CPU.
2577 2565   *
2578      - *      If a cpu is the target of an offline request then try to avoid it.
     2566 + * We do this so ht_adjust_cpu_score() can increment the score if needed,
     2567 + * without ending up over-riding a dispatcher priority.
     2568 + */
     2569 +static pri_t
     2570 +cpu_score(cpu_t *cp, kthread_t *tp)
     2571 +{
     2572 +        pri_t score;
     2573 +
     2574 +        if (tp == curthread && cp == curthread->t_cpu)
     2575 +                score = 2 * CPU_IDLE_PRI;
     2576 +        else if (cp == cpu_inmotion)
     2577 +                score = SHRT_MAX;
     2578 +        else
     2579 +                score = 2 * cp->cpu_dispatch_pri;
     2580 +
     2581 +        if (2 * cp->cpu_disp->disp_maxrunpri > score)
     2582 +                score = 2 * cp->cpu_disp->disp_maxrunpri;
     2583 +        if (2 * cp->cpu_chosen_level > score)
     2584 +                score = 2 * cp->cpu_chosen_level;
     2585 +
     2586 +        return (ht_adjust_cpu_score(tp, cp, score));
     2587 +}
     2588 +
     2589 +/*
     2590 + * disp_lowpri_cpu - find a suitable CPU to run the given thread.
2579 2591   *
2580      - *      This function must be called at either high SPL, or with preemption
2581      - *      disabled, so that the "hint" CPU cannot be removed from the online
2582      - *      CPU list while we are traversing it.
     2592 + * We are looking for a CPU with an effective dispatch priority lower than the
     2593 + * thread's, so that the thread will run immediately rather than be enqueued.
     2594 + * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
     2595 + * If we don't find an available CPU there, we will expand our search to include
     2596 + * wider locality levels. (Note these groups are already divided by CPU
     2597 + * partition.)
     2598 + *
     2599 + * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
     2600 + * the best home CPU we found.
     2601 + *
     2602 + * The hint passed in is used as a starting point so we don't favor CPU 0 or any
     2603 + * other CPU.  The caller should pass in the most recently used CPU for the
     2604 + * thread; it's of course possible that this CPU isn't in the home lgroup.
     2605 + *
     2606 + * This function must be called at either high SPL, or with preemption disabled,
     2607 + * so that the "hint" CPU cannot be removed from the online CPU list while we
     2608 + * are traversing it.
2583 2609   */
2584 2610  cpu_t *
2585      -disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
     2611 +disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
2586 2612  {
2587 2613          cpu_t   *bestcpu;
2588 2614          cpu_t   *besthomecpu;
2589 2615          cpu_t   *cp, *cpstart;
2590 2616  
2591      -        pri_t   bestpri;
2592      -        pri_t   cpupri;
2593      -
2594 2617          klgrpset_t      done;
2595      -        klgrpset_t      cur_set;
2596 2618  
2597 2619          lpl_t           *lpl_iter, *lpl_leaf;
2598      -        int             i;
2599 2620  
2600      -        /*
2601      -         * Scan for a CPU currently running the lowest priority thread.
2602      -         * Cannot get cpu_lock here because it is adaptive.
2603      -         * We do not require lock on CPU list.
2604      -         */
2605 2621          ASSERT(hint != NULL);
2606      -        ASSERT(lpl != NULL);
2607      -        ASSERT(lpl->lpl_ncpu > 0);
     2622 +        ASSERT(tp->t_lpl->lpl_ncpu > 0);
2608 2623  
2609      -        /*
2610      -         * First examine local CPUs. Note that it's possible the hint CPU
2611      -         * passed in in remote to the specified home lgroup. If our priority
2612      -         * isn't sufficient enough such that we can run immediately at home,
2613      -         * then examine CPUs remote to our home lgroup.
2614      -         * We would like to give preference to CPUs closest to "home".
2615      -         * If we can't find a CPU where we'll run at a given level
2616      -         * of locality, we expand our search to include the next level.
2617      -         */
2618 2624          bestcpu = besthomecpu = NULL;
2619 2625          klgrpset_clear(done);
2620      -        /* start with lpl we were passed */
2621 2626  
2622      -        lpl_iter = lpl;
     2627 +        lpl_iter = tp->t_lpl;
2623 2628  
2624 2629          do {
     2630 +                pri_t best = SHRT_MAX;
     2631 +                klgrpset_t cur_set;
2625 2632  
2626      -                bestpri = SHRT_MAX;
2627 2633                  klgrpset_clear(cur_set);
2628 2634  
2629      -                for (i = 0; i < lpl_iter->lpl_nrset; i++) {
     2635 +                for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2636                          lpl_leaf = lpl_iter->lpl_rset[i];
2631 2637                          if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2638                                  continue;
2633 2639  
2634 2640                          klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2641  
2636 2642                          if (hint->cpu_lpl == lpl_leaf)
2637 2643                                  cp = cpstart = hint;
2638 2644                          else
2639 2645                                  cp = cpstart = lpl_leaf->lpl_cpus;
2640 2646  
2641 2647                          do {
2642      -                                if (cp == curcpu)
2643      -                                        cpupri = -1;
2644      -                                else if (cp == cpu_inmotion)
2645      -                                        cpupri = SHRT_MAX;
2646      -                                else
2647      -                                        cpupri = cp->cpu_dispatch_pri;
2648      -                                if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649      -                                        cpupri = cp->cpu_disp->disp_maxrunpri;
2650      -                                if (cp->cpu_chosen_level > cpupri)
2651      -                                        cpupri = cp->cpu_chosen_level;
2652      -                                if (cpupri < bestpri) {
2653      -                                        if (CPU_IDLING(cpupri)) {
2654      -                                                ASSERT((cp->cpu_flags &
2655      -                                                    CPU_QUIESCED) == 0);
2656      -                                                return (cp);
2657      -                                        }
     2648 +                                pri_t score = cpu_score(cp, tp);
     2649 +
     2650 +                                if (score < best) {
     2651 +                                        best = score;
2658 2652                                          bestcpu = cp;
2659      -                                        bestpri = cpupri;
     2653 +
     2654 +                                        /* An idle CPU: we're done. */
     2655 +                                        if (score / 2 == CPU_IDLE_PRI)
     2656 +                                                goto out;
2660 2657                                  }
2661 2658                          } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2659                  }
2663 2660  
2664      -                if (bestcpu && (tpri > bestpri)) {
2665      -                        ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666      -                        return (bestcpu);
2667      -                }
     2661 +                if (bestcpu != NULL && tpri > (best / 2))
     2662 +                        goto out;
     2663 +
2668 2664                  if (besthomecpu == NULL)
2669 2665                          besthomecpu = bestcpu;
     2666 +
2670 2667                  /*
2671 2668                   * Add the lgrps we just considered to the "done" set
2672 2669                   */
2673 2670                  klgrpset_or(done, cur_set);
2674 2671  
2675 2672          } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2673  
2677 2674          /*
2678 2675           * The specified priority isn't high enough to run immediately
2679 2676           * anywhere, so just return the best CPU from the home lgroup.
2680 2677           */
2681      -        ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682      -        return (besthomecpu);
     2678 +        bestcpu = besthomecpu;
     2679 +
     2680 +out:
     2681 +        ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
     2682 +        return (bestcpu);
2683 2683  }
2684 2684  
2685 2685  /*
2686 2686   * This routine provides the generic idle cpu function for all processors.
2687 2687   * If a processor has some specific code to execute when idle (say, to stop
2688 2688   * the pipeline and save power) then that routine should be defined in the
2689 2689   * processors specific code (module_xx.c) and the global variable idle_cpu
2690 2690   * set to that function.
2691 2691   */
2692 2692  static void
2693 2693  generic_idle_cpu(void)
2694 2694  {
2695 2695  }
2696 2696  
2697 2697  /*ARGSUSED*/
2698 2698  static void
2699 2699  generic_enq_thread(cpu_t *cpu, int bound)
2700 2700  {
     2701 +}
     2702 +
     2703 +cpu_t *
     2704 +disp_choose_best_cpu(void)
     2705 +{
     2706 +        kthread_t *t = curthread;
     2707 +        cpu_t *curcpu = CPU;
     2708 +
     2709 +        ASSERT(t->t_preempt > 0);
     2710 +        ASSERT(t->t_state == TS_ONPROC);
     2711 +        ASSERT(t->t_schedflag & TS_VCPU);
     2712 +
     2713 +        if (ht_should_run(t, curcpu))
     2714 +                return (curcpu);
     2715 +
     2716 +        return (disp_lowpri_cpu(curcpu, t, t->t_pri));
2701 2717  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX