Print this page
OS-7125 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/os/lgrp.c
+++ new/usr/src/uts/common/os/lgrp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + * Copyright 2018 Joyent, Inc.
24 25 */
25 26
26 27 /*
27 28 * Basic NUMA support in terms of locality groups
28 29 *
29 30 * Solaris needs to know which CPUs, memory, etc. are near each other to
30 31 * provide good performance on NUMA machines by optimizing for locality.
31 32 * In order to do this, a new abstraction called a "locality group (lgroup)"
32 33 * has been introduced to keep track of which CPU-like and memory-like hardware
33 34 * resources are close to each other. Currently, latency is the only measure
34 35 * used to determine how to group hardware resources into lgroups, but this
35 36 * does not limit the groupings to be based solely on latency. Other factors
36 37 * may be used to determine the groupings in the future.
37 38 *
 38 39 * Lgroups are organized into a hierarchy or topology that represents the
39 40 * latency topology of the machine. There is always at least a root lgroup in
40 41 * the system. It represents all the hardware resources in the machine at a
41 42 * latency big enough that any hardware resource can at least access any other
42 43 * hardware resource within that latency. A Uniform Memory Access (UMA)
43 44 * machine is represented with one lgroup (the root). In contrast, a NUMA
44 45 * machine is represented at least by the root lgroup and some number of leaf
45 46 * lgroups where the leaf lgroups contain the hardware resources within the
46 47 * least latency of each other and the root lgroup still contains all the
47 48 * resources in the machine. Some number of intermediate lgroups may exist
48 49 * which represent more levels of locality than just the local latency of the
49 50 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
50 51 * (eg. root and intermediate lgroups) contain the next nearest resources to
51 52 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
52 53 * to the root lgroup shows the hardware resources from closest to farthest
53 54 * from the leaf lgroup such that each successive ancestor lgroup contains
54 55 * the next nearest resources at the next level of locality from the previous.
55 56 *
56 57 * The kernel uses the lgroup abstraction to know how to allocate resources
57 58 * near a given process/thread. At fork() and lwp/thread_create() time, a
58 59 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
59 60 * with the lowest load average. Binding to a processor or processor set will
60 61 * change the home lgroup for a thread. The scheduler has been modified to try
61 62 * to dispatch a thread on a CPU in its home lgroup. Physical memory
62 63 * allocation is lgroup aware too, so memory will be allocated from the current
63 64 * thread's home lgroup if possible. If the desired resources are not
64 65 * available, the kernel traverses the lgroup hierarchy going to the parent
65 66 * lgroup to find resources at the next level of locality until it reaches the
66 67 * root lgroup.
67 68 */
68 69
69 70 #include <sys/lgrp.h>
70 71 #include <sys/lgrp_user.h>
71 72 #include <sys/types.h>
72 73 #include <sys/mman.h>
73 74 #include <sys/param.h>
74 75 #include <sys/var.h>
75 76 #include <sys/thread.h>
76 77 #include <sys/cpuvar.h>
77 78 #include <sys/cpupart.h>
78 79 #include <sys/kmem.h>
79 80 #include <vm/seg.h>
80 81 #include <vm/seg_kmem.h>
81 82 #include <vm/seg_spt.h>
82 83 #include <vm/seg_vn.h>
↓ open down ↓ |
49 lines elided |
↑ open up ↑ |
83 84 #include <vm/as.h>
84 85 #include <sys/atomic.h>
85 86 #include <sys/systm.h>
86 87 #include <sys/errno.h>
87 88 #include <sys/cmn_err.h>
88 89 #include <sys/kstat.h>
89 90 #include <sys/sysmacros.h>
90 91 #include <sys/pg.h>
91 92 #include <sys/promif.h>
92 93 #include <sys/sdt.h>
94 +#include <sys/ht.h>
93 95
94 96 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
95 97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
96 98 /* indexed by lgrp_id */
97 99 int nlgrps; /* number of lgroups in machine */
98 100 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */
99 101 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */
100 102
101 103 /*
102 104 * Kstat data for lgroups.
103 105 *
104 106 * Actual kstat data is collected in lgrp_stats array.
105 107 * The lgrp_kstat_data array of named kstats is used to extract data from
 106 108 * lgrp_stats and present it to kstat framework. It is protected from parallel
107 109 * modifications by lgrp_kstat_mutex. This may cause some contention when
108 110 * several kstat commands run in parallel but this is not the
109 111 * performance-critical path.
110 112 */
111 113 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */
112 114
113 115 /*
114 116 * Declare kstat names statically for enums as defined in the header file.
115 117 */
116 118 LGRP_KSTAT_NAMES;
117 119
118 120 static void lgrp_kstat_init(void);
119 121 static int lgrp_kstat_extract(kstat_t *, int);
120 122 static void lgrp_kstat_reset(lgrp_id_t);
121 123
122 124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
123 125 static kmutex_t lgrp_kstat_mutex;
124 126
125 127
126 128 /*
127 129 * max number of lgroups supported by the platform
128 130 */
129 131 int nlgrpsmax = 0;
130 132
131 133 /*
132 134 * The root lgroup. Represents the set of resources at the system wide
133 135 * level of locality.
134 136 */
135 137 lgrp_t *lgrp_root = NULL;
136 138
137 139 /*
138 140 * During system bootstrap cp_default does not contain the list of lgrp load
139 141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
140 142 * on-line when cp_default is initialized by cpupart_initialize_default().
141 143 * Configuring CPU0 may create a two-level topology with root and one leaf node
142 144 * containing CPU0. This topology is initially constructed in a special
143 145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
144 146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
145 147 * for all lpl operations until cp_default is fully constructed.
146 148 *
147 149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
148 150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
149 151 * the first element of lpl_bootstrap_list.
150 152 *
151 153 * CPUs that are added to the system, but have not yet been assigned to an
152 154 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
153 155 * on some architectures (x86) it's possible for the slave CPU startup thread
154 156 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
155 157 */
156 158 #define LPL_BOOTSTRAP_SIZE 2
157 159 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
158 160 lpl_t *lpl_bootstrap;
159 161 static lpl_t *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
160 162 static int lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
161 163
162 164 /*
163 165 * If cp still references the bootstrap lpl, it has not yet been added to
164 166 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165 167 * a thread is trying to allocate memory close to a CPU that has no lgrp.
166 168 */
167 169 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap)
168 170
169 171 static lgrp_t lroot;
170 172
171 173 /*
172 174 * Size, in bytes, beyond which random memory allocation policy is applied
173 175 * to non-shared memory. Default is the maximum size, so random memory
174 176 * allocation won't be used for non-shared memory by default.
175 177 */
176 178 size_t lgrp_privm_random_thresh = (size_t)(-1);
177 179
 178 180 /* the maximum effect that a single thread can have on its lgroup's load */
179 181 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180 182 ((lgrp_loadavg_max_effect) / (ncpu))
181 183 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
182 184
183 185
184 186 /*
185 187 * Size, in bytes, beyond which random memory allocation policy is applied to
186 188 * shared memory. Default is 8MB (2 ISM pages).
187 189 */
188 190 size_t lgrp_shm_random_thresh = 8*1024*1024;
189 191
190 192 /*
191 193 * Whether to do processor set aware memory allocation by default
192 194 */
193 195 int lgrp_mem_pset_aware = 0;
194 196
195 197 /*
196 198 * Set the default memory allocation policy for root lgroup
197 199 */
198 200 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199 201
200 202 /*
201 203 * Set the default memory allocation policy. For most platforms,
202 204 * next touch is sufficient, but some platforms may wish to override
203 205 * this.
204 206 */
205 207 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 208
207 209
208 210 /*
209 211 * lgroup CPU event handlers
210 212 */
211 213 static void lgrp_cpu_init(struct cpu *);
212 214 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213 215 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *);
214 216
215 217 /*
216 218 * lgroup memory event handlers
217 219 */
218 220 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219 221 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220 222 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221 223
222 224 /*
223 225 * lgroup CPU partition event handlers
224 226 */
225 227 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226 228 static void lgrp_part_del_cpu(struct cpu *);
227 229
228 230 /*
229 231 * lgroup framework initialization
230 232 */
231 233 static void lgrp_main_init(void);
232 234 static void lgrp_main_mp_init(void);
233 235 static void lgrp_root_init(void);
234 236 static void lgrp_setup(void);
235 237
236 238 /*
237 239 * lpl topology
238 240 */
239 241 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *);
240 242 static void lpl_clear(lpl_t *);
241 243 static void lpl_leaf_insert(lpl_t *, struct cpupart *);
242 244 static void lpl_leaf_remove(lpl_t *, struct cpupart *);
243 245 static void lpl_rset_add(lpl_t *, lpl_t *);
244 246 static void lpl_rset_del(lpl_t *, lpl_t *);
245 247 static int lpl_rset_contains(lpl_t *, lpl_t *);
246 248 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
247 249 static void lpl_child_update(lpl_t *, struct cpupart *);
248 250 static int lpl_pick(lpl_t *, lpl_t *);
249 251 static void lpl_verify_wrapper(struct cpupart *);
250 252
251 253 /*
252 254 * defines for lpl topology verifier return codes
253 255 */
254 256
255 257 #define LPL_TOPO_CORRECT 0
256 258 #define LPL_TOPO_PART_HAS_NO_LPL -1
257 259 #define LPL_TOPO_CPUS_NOT_EMPTY -2
258 260 #define LPL_TOPO_LGRP_MISMATCH -3
259 261 #define LPL_TOPO_MISSING_PARENT -4
260 262 #define LPL_TOPO_PARENT_MISMATCH -5
261 263 #define LPL_TOPO_BAD_CPUCNT -6
262 264 #define LPL_TOPO_RSET_MISMATCH -7
263 265 #define LPL_TOPO_LPL_ORPHANED -8
264 266 #define LPL_TOPO_LPL_BAD_NCPU -9
265 267 #define LPL_TOPO_RSET_MSSNG_LF -10
266 268 #define LPL_TOPO_CPU_HAS_BAD_LPL -11
267 269 #define LPL_TOPO_NONLEAF_HAS_CPUS -12
268 270 #define LPL_TOPO_LGRP_NOT_LEAF -13
269 271 #define LPL_TOPO_BAD_RSETCNT -14
270 272
271 273 /*
272 274 * Return whether lgroup optimizations should be enabled on this system
273 275 */
274 276 int
275 277 lgrp_optimizations(void)
276 278 {
277 279 /*
278 280 * System must have more than 2 lgroups to enable lgroup optimizations
279 281 *
280 282 * XXX This assumes that a 2 lgroup system has an empty root lgroup
281 283 * with one child lgroup containing all the resources. A 2 lgroup
282 284 * system with a root lgroup directly containing CPUs or memory might
283 285 * need lgroup optimizations with its child lgroup, but there
284 286 * isn't such a machine for now....
285 287 */
286 288 if (nlgrps > 2)
287 289 return (1);
288 290
289 291 return (0);
290 292 }
291 293
292 294 /*
293 295 * Setup root lgroup
294 296 */
295 297 static void
296 298 lgrp_root_init(void)
297 299 {
298 300 lgrp_handle_t hand;
299 301 int i;
300 302 lgrp_id_t id;
301 303
302 304 /*
303 305 * Create the "root" lgroup
304 306 */
305 307 ASSERT(nlgrps == 0);
306 308 id = nlgrps++;
307 309
308 310 lgrp_root = &lroot;
309 311
310 312 lgrp_root->lgrp_cpu = NULL;
311 313 lgrp_root->lgrp_mnodes = 0;
312 314 lgrp_root->lgrp_nmnodes = 0;
313 315 hand = lgrp_plat_root_hand();
314 316 lgrp_root->lgrp_plathand = hand;
315 317
316 318 lgrp_root->lgrp_id = id;
317 319 lgrp_root->lgrp_cpucnt = 0;
318 320 lgrp_root->lgrp_childcnt = 0;
319 321 klgrpset_clear(lgrp_root->lgrp_children);
320 322 klgrpset_clear(lgrp_root->lgrp_leaves);
321 323 lgrp_root->lgrp_parent = NULL;
322 324 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
323 325
324 326 for (i = 0; i < LGRP_RSRC_COUNT; i++)
325 327 klgrpset_clear(lgrp_root->lgrp_set[i]);
326 328
327 329 lgrp_root->lgrp_kstat = NULL;
328 330
329 331 lgrp_table[id] = lgrp_root;
330 332
331 333 /*
332 334 * Setup initial lpl list for CPU0 and initial t0 home.
333 335 * The only lpl space we have so far is lpl_bootstrap. It is used for
334 336 * all topology operations until cp_default is initialized at which
335 337 * point t0.t_lpl will be updated.
336 338 */
337 339 lpl_bootstrap = lpl_bootstrap_list;
338 340 t0.t_lpl = lpl_bootstrap;
339 341 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
340 342 lpl_bootstrap_list[1].lpl_lgrpid = 1;
341 343
342 344 /*
343 345 * Set up the bootstrap rset
 344 346 * Since the bootstrap topology has just the root, and a leaf,
345 347 * the rset contains just the leaf, and both lpls can use the same rset
346 348 */
347 349 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
348 350 lpl_bootstrap_list[0].lpl_rset_sz = 1;
349 351 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
350 352 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
351 353
352 354 lpl_bootstrap_list[1].lpl_rset_sz = 1;
353 355 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
354 356 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
355 357
356 358 cp_default.cp_lgrploads = lpl_bootstrap;
357 359 }
358 360
359 361 /*
360 362 * Initialize the lgroup framework and allow the platform to do the same
361 363 *
362 364 * This happens in stages during boot and is all funnelled through this routine
363 365 * (see definition of lgrp_init_stages_t to see what happens at each stage and
364 366 * when)
365 367 */
366 368 void
367 369 lgrp_init(lgrp_init_stages_t stage)
368 370 {
369 371 /*
370 372 * Initialize the platform
371 373 */
372 374 lgrp_plat_init(stage);
373 375
374 376 switch (stage) {
375 377 case LGRP_INIT_STAGE1:
376 378 /*
377 379 * Set max number of lgroups supported on this platform which
378 380 * must be less than the max number of lgroups supported by the
379 381 * common lgroup framework (eg. NLGRPS_MAX is max elements in
380 382 * lgrp_table[], etc.)
381 383 */
382 384 nlgrpsmax = lgrp_plat_max_lgrps();
383 385 ASSERT(nlgrpsmax <= NLGRPS_MAX);
384 386 break;
385 387
386 388 case LGRP_INIT_STAGE2:
387 389 lgrp_setup();
388 390 break;
389 391
390 392 case LGRP_INIT_STAGE4:
391 393 lgrp_main_init();
392 394 break;
393 395
394 396 case LGRP_INIT_STAGE5:
395 397 lgrp_main_mp_init();
396 398 break;
397 399
398 400 default:
399 401 break;
400 402 }
401 403 }
402 404
403 405 /*
404 406 * Create the root and cpu0's lgroup, and set t0's home.
405 407 */
406 408 static void
407 409 lgrp_setup(void)
408 410 {
409 411 /*
410 412 * Setup the root lgroup
411 413 */
412 414 lgrp_root_init();
413 415
414 416 /*
415 417 * Add cpu0 to an lgroup
416 418 */
417 419 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
418 420 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
419 421 }
420 422
421 423 /*
422 424 * true when lgrp initialization has been completed.
423 425 */
424 426 int lgrp_initialized = 0;
425 427
426 428 /*
427 429 * True when lgrp topology is constructed.
428 430 */
429 431 int lgrp_topo_initialized = 0;
430 432
431 433 /*
432 434 * Init routine called after startup(), /etc/system has been processed,
433 435 * and cpu0 has been added to an lgroup.
434 436 */
435 437 static void
436 438 lgrp_main_init(void)
437 439 {
438 440 cpu_t *cp = CPU;
439 441 lgrp_id_t lgrpid;
440 442 int i;
441 443 extern void pg_cpu0_reinit();
442 444
443 445 /*
444 446 * Enforce a valid lgrp_mem_default_policy
445 447 */
446 448 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
447 449 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
448 450 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
449 451 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
450 452
451 453 /*
452 454 * See if mpo should be disabled.
453 455 * This may happen in the case of null proc LPA on Starcat.
454 456 * The platform won't be able to detect null proc LPA until after
455 457 * cpu0 and memory have already been added to lgroups.
456 458 * When and if it is detected, the Starcat platform will return
457 459 * a different platform handle for cpu0 which is what we check for
 458 460 * here. If mpo should be disabled move cpu0 to its rightful place
459 461 * (the root), and destroy the remaining lgroups. This effectively
 460 462 * provides a UMA lgroup topology.
461 463 */
462 464 lgrpid = cp->cpu_lpl->lpl_lgrpid;
463 465 if (lgrp_table[lgrpid]->lgrp_plathand !=
464 466 lgrp_plat_cpu_to_hand(cp->cpu_id)) {
465 467 lgrp_part_del_cpu(cp);
466 468 lgrp_cpu_fini(cp, lgrpid);
467 469
468 470 lgrp_cpu_init(cp);
469 471 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
470 472
471 473 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
472 474
473 475 /*
474 476 * Notify the PG subsystem that the CPU's lgrp
475 477 * association has changed
476 478 */
477 479 pg_cpu0_reinit();
478 480
479 481 /*
480 482 * Destroy all lgroups except for root
481 483 */
482 484 for (i = 0; i <= lgrp_alloc_max; i++) {
483 485 if (LGRP_EXISTS(lgrp_table[i]) &&
484 486 lgrp_table[i] != lgrp_root)
485 487 lgrp_destroy(lgrp_table[i]);
486 488 }
487 489
488 490 /*
489 491 * Fix up root to point at itself for leaves and resources
490 492 * and not have any children
491 493 */
492 494 lgrp_root->lgrp_childcnt = 0;
493 495 klgrpset_clear(lgrp_root->lgrp_children);
494 496 klgrpset_clear(lgrp_root->lgrp_leaves);
495 497 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
496 498 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
497 499 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
498 500 }
499 501
500 502 /*
501 503 * Initialize kstats framework.
502 504 */
503 505 lgrp_kstat_init();
504 506 /*
 505 507 * cpu0 is finally where it should be, so create its lgroup's kstats
506 508 */
507 509 mutex_enter(&cpu_lock);
508 510 lgrp_kstat_create(cp);
509 511 mutex_exit(&cpu_lock);
510 512
511 513 lgrp_initialized = 1;
512 514 }
↓ open down ↓ |
410 lines elided |
↑ open up ↑ |
513 515
514 516 /*
515 517 * Finish lgrp initialization after all CPUS are brought on-line.
516 518 * This routine is called after start_other_cpus().
517 519 */
518 520 static void
519 521 lgrp_main_mp_init(void)
520 522 {
521 523 klgrpset_t changed;
522 524
525 + ht_init();
526 +
523 527 /*
524 528 * Update lgroup topology (if necessary)
525 529 */
526 530 klgrpset_clear(changed);
527 531 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
528 532 lgrp_topo_initialized = 1;
529 533 }
530 534
531 535 /*
532 536 * Change latency of lgroup with specified lgroup platform handle (if one is
533 537 * given) or change all lgroups with old latency to new latency
534 538 */
535 539 void
536 540 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
537 541 u_longlong_t newtime)
538 542 {
539 543 lgrp_t *lgrp;
540 544 int i;
541 545
542 546 for (i = 0; i <= lgrp_alloc_max; i++) {
543 547 lgrp = lgrp_table[i];
544 548
545 549 if (!LGRP_EXISTS(lgrp))
546 550 continue;
547 551
548 552 if ((hand == LGRP_NULL_HANDLE &&
549 553 lgrp->lgrp_latency == oldtime) ||
550 554 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
551 555 lgrp->lgrp_latency = (int)newtime;
552 556 }
553 557 }
554 558
555 559 /*
556 560 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
557 561 */
558 562 void
559 563 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
560 564 {
561 565 klgrpset_t changed;
562 566 cpu_t *cp;
563 567 lgrp_id_t id;
564 568 int rc;
565 569
566 570 switch (event) {
567 571 /*
568 572 * The following (re)configuration events are common code
569 573 * initiated. lgrp_plat_config() is called here to inform the
570 574 * platform of the reconfiguration event.
571 575 */
572 576 case LGRP_CONFIG_CPU_ADD:
573 577 cp = (cpu_t *)resource;
574 578
575 579 /*
576 580 * Initialize the new CPU's lgrp related next/prev
577 581 * links, and give it a bootstrap lpl so that it can
578 582 * survive should it need to enter the dispatcher.
579 583 */
580 584 cp->cpu_next_lpl = cp;
581 585 cp->cpu_prev_lpl = cp;
582 586 cp->cpu_next_lgrp = cp;
583 587 cp->cpu_prev_lgrp = cp;
584 588 cp->cpu_lpl = lpl_bootstrap;
585 589
586 590 lgrp_plat_config(event, resource);
587 591 atomic_inc_32(&lgrp_gen);
588 592
589 593 break;
590 594 case LGRP_CONFIG_CPU_DEL:
591 595 lgrp_plat_config(event, resource);
592 596 atomic_inc_32(&lgrp_gen);
593 597
594 598 break;
595 599 case LGRP_CONFIG_CPU_ONLINE:
596 600 cp = (cpu_t *)resource;
597 601 lgrp_cpu_init(cp);
598 602 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
599 603 rc = lpl_topo_verify(cp->cpu_part);
600 604 if (rc != LPL_TOPO_CORRECT) {
601 605 panic("lpl_topo_verify failed: %d", rc);
602 606 }
603 607 lgrp_plat_config(event, resource);
604 608 atomic_inc_32(&lgrp_gen);
605 609
606 610 break;
607 611 case LGRP_CONFIG_CPU_OFFLINE:
608 612 cp = (cpu_t *)resource;
609 613 id = cp->cpu_lpl->lpl_lgrpid;
610 614 lgrp_part_del_cpu(cp);
611 615 lgrp_cpu_fini(cp, id);
612 616 rc = lpl_topo_verify(cp->cpu_part);
613 617 if (rc != LPL_TOPO_CORRECT) {
614 618 panic("lpl_topo_verify failed: %d", rc);
615 619 }
616 620 lgrp_plat_config(event, resource);
617 621 atomic_inc_32(&lgrp_gen);
618 622
619 623 break;
620 624 case LGRP_CONFIG_CPUPART_ADD:
621 625 cp = (cpu_t *)resource;
622 626 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
623 627 rc = lpl_topo_verify(cp->cpu_part);
624 628 if (rc != LPL_TOPO_CORRECT) {
625 629 panic("lpl_topo_verify failed: %d", rc);
626 630 }
627 631 lgrp_plat_config(event, resource);
628 632
629 633 break;
630 634 case LGRP_CONFIG_CPUPART_DEL:
631 635 cp = (cpu_t *)resource;
632 636 lgrp_part_del_cpu((cpu_t *)resource);
633 637 rc = lpl_topo_verify(cp->cpu_part);
634 638 if (rc != LPL_TOPO_CORRECT) {
635 639 panic("lpl_topo_verify failed: %d", rc);
636 640 }
637 641 lgrp_plat_config(event, resource);
638 642
639 643 break;
640 644 /*
641 645 * The following events are initiated by the memnode
642 646 * subsystem.
643 647 */
644 648 case LGRP_CONFIG_MEM_ADD:
645 649 lgrp_mem_init((int)resource, where, B_FALSE);
646 650 atomic_inc_32(&lgrp_gen);
647 651
648 652 break;
649 653 case LGRP_CONFIG_MEM_DEL:
650 654 lgrp_mem_fini((int)resource, where, B_FALSE);
651 655 atomic_inc_32(&lgrp_gen);
652 656
653 657 break;
654 658 case LGRP_CONFIG_MEM_RENAME: {
655 659 lgrp_config_mem_rename_t *ren_arg =
656 660 (lgrp_config_mem_rename_t *)where;
657 661
658 662 lgrp_mem_rename((int)resource,
659 663 ren_arg->lmem_rename_from,
660 664 ren_arg->lmem_rename_to);
661 665 atomic_inc_32(&lgrp_gen);
662 666
663 667 break;
664 668 }
665 669 case LGRP_CONFIG_GEN_UPDATE:
666 670 atomic_inc_32(&lgrp_gen);
667 671
668 672 break;
669 673 case LGRP_CONFIG_FLATTEN:
670 674 if (where == 0)
671 675 lgrp_topo_levels = (int)resource;
672 676 else
673 677 (void) lgrp_topo_flatten(resource,
674 678 lgrp_table, lgrp_alloc_max, &changed);
675 679
676 680 break;
677 681 /*
678 682 * Update any lgroups with old latency to new latency
679 683 */
680 684 case LGRP_CONFIG_LAT_CHANGE_ALL:
681 685 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
682 686 (u_longlong_t)where);
683 687
684 688 break;
685 689 /*
686 690 * Update lgroup with specified lgroup platform handle to have
687 691 * new latency
688 692 */
689 693 case LGRP_CONFIG_LAT_CHANGE:
690 694 lgrp_latency_change((lgrp_handle_t)resource, 0,
691 695 (u_longlong_t)where);
692 696
693 697 break;
694 698 case LGRP_CONFIG_NOP:
695 699
696 700 break;
697 701 default:
698 702 break;
699 703 }
700 704
701 705 }
702 706
703 707 /*
704 708 * Called to add lgrp info into cpu structure from cpu_add_unit;
705 709 * do not assume cpu is in cpu[] yet!
706 710 *
707 711 * CPUs are brought online with all other CPUs paused so we can't
708 712 * allocate memory or we could deadlock the system, so we rely on
709 713 * the platform to statically allocate as much space as we need
710 714 * for the lgrp structs and stats.
711 715 */
712 716 static void
713 717 lgrp_cpu_init(struct cpu *cp)
714 718 {
715 719 klgrpset_t changed;
716 720 int count;
717 721 lgrp_handle_t hand;
718 722 int first_cpu;
719 723 lgrp_t *my_lgrp;
720 724 lgrp_id_t lgrpid;
721 725 struct cpu *cptr;
722 726
723 727 /*
724 728 * This is the first time through if the resource set
725 729 * for the root lgroup is empty. After cpu0 has been
726 730 * initially added to an lgroup, the root's CPU resource
727 731 * set can never be empty, since the system's last CPU
728 732 * cannot be offlined.
729 733 */
730 734 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
731 735 /*
732 736 * First time through.
733 737 */
734 738 first_cpu = 1;
735 739 } else {
736 740 /*
737 741 * If cpu0 needs to move lgroups, we may come
738 742 * through here again, at which time cpu_lock won't
739 743 * be held, and lgrp_initialized will be false.
740 744 */
741 745 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
742 746 ASSERT(cp->cpu_part != NULL);
743 747 first_cpu = 0;
744 748 }
745 749
746 750 hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
747 751 my_lgrp = lgrp_hand_to_lgrp(hand);
748 752
749 753 if (my_lgrp == NULL) {
750 754 /*
751 755 * Create new lgrp and add it to lgroup topology
752 756 */
753 757 my_lgrp = lgrp_create();
754 758 my_lgrp->lgrp_plathand = hand;
755 759 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
756 760 lgrpid = my_lgrp->lgrp_id;
757 761 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
758 762 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
759 763
760 764 count = 0;
761 765 klgrpset_clear(changed);
762 766 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
763 767 &changed);
764 768 /*
765 769 * May have added new intermediate lgroups, so need to add
766 770 * resources other than CPUs which are added below
767 771 */
768 772 (void) lgrp_mnode_update(changed, NULL);
769 773 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
770 774 > 0) {
771 775 /*
772 776 * Leaf lgroup was created, but latency wasn't available
773 777 * then. So, set latency for it and fill in rest of lgroup
774 778 * topology now that we know how far it is from other leaf
775 779 * lgroups.
776 780 */
777 781 lgrpid = my_lgrp->lgrp_id;
778 782 klgrpset_clear(changed);
779 783 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
780 784 lgrpid))
781 785 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
782 786 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
783 787 &changed);
784 788
785 789 /*
786 790 * May have added new intermediate lgroups, so need to add
787 791 * resources other than CPUs which are added below
788 792 */
789 793 (void) lgrp_mnode_update(changed, NULL);
790 794 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
791 795 my_lgrp->lgrp_id)) {
792 796 int i;
793 797
794 798 /*
795 799 * Update existing lgroup and lgroups containing it with CPU
796 800 * resource
797 801 */
798 802 lgrpid = my_lgrp->lgrp_id;
799 803 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
800 804 for (i = 0; i <= lgrp_alloc_max; i++) {
801 805 lgrp_t *lgrp;
802 806
803 807 lgrp = lgrp_table[i];
804 808 if (!LGRP_EXISTS(lgrp) ||
805 809 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
806 810 continue;
807 811
808 812 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
809 813 }
810 814 }
811 815
812 816 lgrpid = my_lgrp->lgrp_id;
813 817 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
814 818
815 819 /*
816 820 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
817 821 * end up in lpl for lgroup 0 whether it is supposed to be in there or
818 822 * not since none of lgroup IDs in the lpl's have been set yet.
819 823 */
820 824 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
821 825 cp->cpu_lpl->lpl_lgrpid = lgrpid;
822 826
823 827 /*
824 828 * link the CPU into the lgrp's CPU list
825 829 */
826 830 if (my_lgrp->lgrp_cpucnt == 0) {
827 831 my_lgrp->lgrp_cpu = cp;
828 832 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
829 833 } else {
830 834 cptr = my_lgrp->lgrp_cpu;
831 835 cp->cpu_next_lgrp = cptr;
832 836 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
833 837 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
834 838 cptr->cpu_prev_lgrp = cp;
835 839 }
836 840 my_lgrp->lgrp_cpucnt++;
837 841 }
838 842
/*
 * Allocate a new lgroup, reusing a previously freed lgroup table slot when
 * one is available.  Returns an initialized lgrp_t with no parent, children,
 * CPUs, or memory nodes.  Caller must hold cpu_lock once lgroups are
 * initialized.  Panics if the platform needs more lgroups than nlgrpsmax.
 */
lgrp_t *
lgrp_create(void)
{
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	int		i;

	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/*
	 * Find an open slot in the lgroup table and recycle unused lgroup
	 * left there if any
	 */
	my_lgrp = NULL;
	if (lgrp_alloc_hint == -1)
		/*
		 * Allocate from end when hint not set yet because no lgroups
		 * have been deleted yet
		 */
		lgrpid = nlgrps++;
	else {
		/*
		 * Start looking for next open slot from hint and leave hint
		 * at slot allocated
		 */
		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
			my_lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(my_lgrp)) {
				lgrpid = i;
				nlgrps++;
				break;
			}
		}
		/*
		 * NOTE(review): if the scan above finds no free slot,
		 * lgrpid is read here uninitialized and my_lgrp points at
		 * an existing lgroup.  Presumably the table can never be
		 * full when the hint is set -- confirm against callers.
		 */
		lgrp_alloc_hint = lgrpid;
	}

	/*
	 * Keep track of max lgroup ID allocated so far to cut down on searches
	 */
	if (lgrpid > lgrp_alloc_max)
		lgrp_alloc_max = lgrpid;

	/*
	 * Need to allocate new lgroup if next open slot didn't have one
	 * for recycling
	 */
	if (my_lgrp == NULL)
		my_lgrp = lgrp_plat_alloc(lgrpid);

	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
		panic("Too many lgrps for platform (%d)", nlgrps);

	/*
	 * Reset all fields of the (possibly recycled) lgroup to a known
	 * empty state before handing it back.
	 */
	my_lgrp->lgrp_id = lgrpid;
	my_lgrp->lgrp_latency = 0;
	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	my_lgrp->lgrp_parent = NULL;
	my_lgrp->lgrp_childcnt = 0;
	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
	my_lgrp->lgrp_nmnodes = 0;
	klgrpset_clear(my_lgrp->lgrp_children);
	klgrpset_clear(my_lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(my_lgrp->lgrp_set[i]);

	my_lgrp->lgrp_cpu = NULL;
	my_lgrp->lgrp_cpucnt = 0;

	/* Zero any kstat counters left over from a recycled slot */
	if (my_lgrp->lgrp_kstat != NULL)
		lgrp_kstat_reset(lgrpid);

	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

	return (my_lgrp);
}
913 917
/*
 * Tear down an lgroup and mark its table slot for recycling by
 * lgrp_create().  The lgrp_t itself is not freed; its ID is set to
 * LGRP_NONE and its fields are cleared.
 */
void
lgrp_destroy(lgrp_t *lgrp)
{
	int	i;

	/*
	 * Unless this lgroup is being destroyed on behalf of
	 * the boot CPU, cpu_lock must be held
	 */
	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/* The last remaining lgroup (the root) must never be destroyed */
	if (nlgrps == 1)
		cmn_err(CE_PANIC, "Can't destroy only lgroup!");

	if (!LGRP_EXISTS(lgrp))
		return;

	/*
	 * Set hint to lgroup being deleted and try to keep lower numbered
	 * hints to facilitate finding empty slots
	 */
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;

	nlgrps--;
}
961 965
962 966 /*
963 967 * Initialize kstat data. Called from lgrp intialization code.
964 968 */
965 969 static void
966 970 lgrp_kstat_init(void)
967 971 {
968 972 lgrp_stat_t stat;
969 973
970 974 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
971 975
972 976 for (stat = 0; stat < LGRP_NUM_STATS; stat++)
973 977 kstat_named_init(&lgrp_kstat_data[stat],
974 978 lgrp_kstat_names[stat], KSTAT_DATA_INT64);
975 979 }
976 980
977 981 /*
978 982 * initialize an lgrp's kstats if needed
979 983 * called with cpu_lock held but not with cpus paused.
980 984 * we don't tear these down now because we don't know about
981 985 * memory leaving the lgrp yet...
982 986 */
983 987
984 988 void
985 989 lgrp_kstat_create(cpu_t *cp)
986 990 {
987 991 kstat_t *lgrp_kstat;
988 992 lgrp_id_t lgrpid;
989 993 lgrp_t *my_lgrp;
990 994
991 995 ASSERT(MUTEX_HELD(&cpu_lock));
992 996
993 997 lgrpid = cp->cpu_lpl->lpl_lgrpid;
994 998 my_lgrp = lgrp_table[lgrpid];
995 999
996 1000 if (my_lgrp->lgrp_kstat != NULL)
997 1001 return; /* already initialized */
998 1002
999 1003 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1000 1004 KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1001 1005 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1002 1006
1003 1007 if (lgrp_kstat != NULL) {
1004 1008 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1005 1009 lgrp_kstat->ks_private = my_lgrp;
1006 1010 lgrp_kstat->ks_data = &lgrp_kstat_data;
1007 1011 lgrp_kstat->ks_update = lgrp_kstat_extract;
1008 1012 my_lgrp->lgrp_kstat = lgrp_kstat;
1009 1013 kstat_install(lgrp_kstat);
1010 1014 }
1011 1015 }
1012 1016
/*
 * Placeholder: lgroup kstats are currently never torn down.  This will do
 * something when we manage to remove now-unused lgrps.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}
1023 1027
/*
 * Called when a CPU is off-lined.  Unlinks "cp" from its lgroup's circular
 * CPU list and, if it was the last CPU in the lgroup, removes the lgroup's
 * CPU resource (and the lgroup itself from the topology when it holds no
 * other resources).
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t		*my_lgrp;
	struct cpu	*prev;
	struct cpu	*next;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	/* Unlink cp from the doubly-linked circular per-lgroup CPU list */
	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			/*
			 * NOTE(review): "count" and "changed" are computed
			 * but never consumed on this path.
			 */
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	/* If cp was the lgroup's list anchor, advance the anchor */
	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}
1097 1101
/*
 * Update memory nodes in target lgroups and return ones that get changed.
 * For every lgroup in "target", recompute its lgrp_mnodes/lgrp_nmnodes from
 * the lgroups in its memory resource set.  Returns the number of updates
 * performed and, if "changed" is non-NULL, fills it with the IDs of the
 * lgroups that were touched.
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of the target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			/* One resource lgroup merged in; note the change */
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}
1178 1182
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
1224 1228
/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
	 * recognize this case and continue as usual, but skip the update to
	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
	 * in topology, temporarily introduced by lgrp_mem_fini().
	 */
	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		/* Pause CPUs while the topology is rewired */
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	}

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1367 1371
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1506 1510
1507 1511 /*
1508 1512 * Return lgroup with given platform handle
1509 1513 */
1510 1514 lgrp_t *
1511 1515 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1512 1516 {
1513 1517 int i;
1514 1518 lgrp_t *lgrp;
1515 1519
1516 1520 if (hand == LGRP_NULL_HANDLE)
1517 1521 return (NULL);
1518 1522
1519 1523 for (i = 0; i <= lgrp_alloc_max; i++) {
1520 1524 lgrp = lgrp_table[i];
1521 1525 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1522 1526 return (lgrp);
1523 1527 }
1524 1528 return (NULL);
1525 1529 }
1526 1530
1527 1531 /*
1528 1532 * Return the home lgroup of the current thread.
1529 1533 * We must do this with kernel preemption disabled, since we don't want our
1530 1534 * thread to be re-homed while we're poking around with its lpl, and the lpl
1531 1535 * should never be NULL.
1532 1536 *
1533 1537 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1534 1538 * is enabled because of DR. Callers can use disable kernel preemption
1535 1539 * around this call to guarantee that the lgroup will be valid beyond this
1536 1540 * routine, since kernel preemption can be recursive.
1537 1541 */
1538 1542 lgrp_t *
1539 1543 lgrp_home_lgrp(void)
1540 1544 {
1541 1545 lgrp_t *lgrp;
1542 1546 lpl_t *lpl;
1543 1547
1544 1548 kpreempt_disable();
1545 1549
1546 1550 lpl = curthread->t_lpl;
1547 1551 ASSERT(lpl != NULL);
1548 1552 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1549 1553 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1550 1554 lgrp = lgrp_table[lpl->lpl_lgrpid];
1551 1555
1552 1556 kpreempt_enable();
1553 1557
1554 1558 return (lgrp);
1555 1559 }
1556 1560
1557 1561 /*
1558 1562 * Return ID of home lgroup for given thread
1559 1563 * (See comments for lgrp_home_lgrp() for special care and handling
1560 1564 * instructions)
1561 1565 */
1562 1566 lgrp_id_t
1563 1567 lgrp_home_id(kthread_t *t)
1564 1568 {
1565 1569 lgrp_id_t lgrp;
1566 1570 lpl_t *lpl;
1567 1571
1568 1572 ASSERT(t != NULL);
1569 1573 /*
1570 1574 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1571 1575 * cannot since the HAT layer can call into this routine to
1572 1576 * determine the locality for its data structures in the context
1573 1577 * of a page fault.
1574 1578 */
1575 1579
1576 1580 kpreempt_disable();
1577 1581
1578 1582 lpl = t->t_lpl;
1579 1583 ASSERT(lpl != NULL);
1580 1584 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1581 1585 lgrp = lpl->lpl_lgrpid;
1582 1586
1583 1587 kpreempt_enable();
1584 1588
1585 1589 return (lgrp);
1586 1590 }
1587 1591
1588 1592 /*
1589 1593 * Return lgroup containing the physical memory for the given page frame number
1590 1594 */
1591 1595 lgrp_t *
1592 1596 lgrp_pfn_to_lgrp(pfn_t pfn)
1593 1597 {
1594 1598 lgrp_handle_t hand;
1595 1599 int i;
1596 1600 lgrp_t *lgrp;
1597 1601
1598 1602 hand = lgrp_plat_pfn_to_hand(pfn);
1599 1603 if (hand != LGRP_NULL_HANDLE)
1600 1604 for (i = 0; i <= lgrp_alloc_max; i++) {
1601 1605 lgrp = lgrp_table[i];
1602 1606 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1603 1607 return (lgrp);
1604 1608 }
1605 1609 return (NULL);
1606 1610 }
1607 1611
1608 1612 /*
1609 1613 * Return lgroup containing the physical memory for the given page frame number
1610 1614 */
1611 1615 lgrp_t *
1612 1616 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1613 1617 {
1614 1618 lgrp_handle_t hand;
1615 1619 int i;
1616 1620 lgrp_t *lgrp;
1617 1621 pfn_t pfn;
1618 1622
1619 1623 pfn = btop(physaddr);
1620 1624 hand = lgrp_plat_pfn_to_hand(pfn);
1621 1625 if (hand != LGRP_NULL_HANDLE)
1622 1626 for (i = 0; i <= lgrp_alloc_max; i++) {
1623 1627 lgrp = lgrp_table[i];
1624 1628 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1625 1629 return (lgrp);
1626 1630 }
1627 1631 return (NULL);
1628 1632 }
1629 1633
/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take precautions necessary to prevent
 * "cpu", and its lpl from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_lpl->lpl_lgrp);
}
1642 1646
1643 1647 /*
1644 1648 * Return the sum of the partition loads in an lgrp divided by
1645 1649 * the number of CPUs in the lgrp. This is our best approximation
1646 1650 * of an 'lgroup load average' for a useful per-lgroup kstat.
1647 1651 */
1648 1652 static uint64_t
1649 1653 lgrp_sum_loadavgs(lgrp_t *lgrp)
1650 1654 {
1651 1655 cpu_t *cpu;
1652 1656 int ncpu;
1653 1657 uint64_t loads = 0;
1654 1658
1655 1659 mutex_enter(&cpu_lock);
1656 1660
1657 1661 cpu = lgrp->lgrp_cpu;
1658 1662 ncpu = lgrp->lgrp_cpucnt;
1659 1663
1660 1664 if (cpu == NULL || ncpu == 0) {
1661 1665 mutex_exit(&cpu_lock);
1662 1666 return (0ull);
1663 1667 }
1664 1668
1665 1669 do {
1666 1670 loads += cpu->cpu_lpl->lpl_loadavg;
1667 1671 cpu = cpu->cpu_next_lgrp;
1668 1672 } while (cpu != lgrp->lgrp_cpu);
1669 1673
1670 1674 mutex_exit(&cpu_lock);
1671 1675
1672 1676 return (loads / ncpu);
1673 1677 }
1674 1678
1675 1679 void
1676 1680 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1677 1681 {
1678 1682 struct lgrp_stats *pstats;
1679 1683
1680 1684 /*
1681 1685 * Verify that the caller isn't trying to add to
1682 1686 * a statistic for an lgroup that has gone away
1683 1687 */
1684 1688 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1685 1689 return;
1686 1690
1687 1691 pstats = &lgrp_stats[lgrpid];
1688 1692 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1689 1693 }
1690 1694
1691 1695 int64_t
1692 1696 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1693 1697 {
1694 1698 uint64_t val;
1695 1699 struct lgrp_stats *pstats;
1696 1700
1697 1701 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1698 1702 return ((int64_t)0);
1699 1703
1700 1704 pstats = &lgrp_stats[lgrpid];
1701 1705 LGRP_STAT_READ(pstats, stat, val);
1702 1706 return (val);
1703 1707 }
1704 1708
1705 1709 /*
1706 1710 * Reset all kstats for lgrp specified by its lgrpid.
1707 1711 */
1708 1712 static void
1709 1713 lgrp_kstat_reset(lgrp_id_t lgrpid)
1710 1714 {
1711 1715 lgrp_stat_t stat;
1712 1716
1713 1717 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1714 1718 return;
1715 1719
1716 1720 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1717 1721 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1718 1722 }
1719 1723 }
1720 1724
/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t	stat;
	struct kstat_named *ksd;
	lgrp_t		*lgrp;
	lgrp_id_t	lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	/*
	 * NOTE: after each counter loop below, "stat" is left equal to
	 * LGRP_NUM_COUNTER_STATS and is deliberately reused as the base
	 * index for the snapshot entries (LGRP_NUM_CPUS, etc.).
	 */
	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
		    lgrp_loadavg_max_effect;
	} else {
		/* Any write resets the running counters */
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}
1782 1786
1783 1787 int
1784 1788 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1785 1789 {
1786 1790 cpu_t *cp;
1787 1791
1788 1792 mutex_enter(&cpu_lock);
1789 1793
1790 1794 if ((cp = cpu_get(id)) == NULL) {
1791 1795 mutex_exit(&cpu_lock);
1792 1796 return (EINVAL);
1793 1797 }
1794 1798
1795 1799 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1796 1800 mutex_exit(&cpu_lock);
1797 1801 return (EINVAL);
1798 1802 }
1799 1803
1800 1804 ASSERT(cp->cpu_lpl != NULL);
1801 1805
1802 1806 *lp = cp->cpu_lpl->lpl_lgrpid;
1803 1807
1804 1808 mutex_exit(&cpu_lock);
1805 1809
1806 1810 return (0);
1807 1811 }
1808 1812
1809 1813 int
1810 1814 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1811 1815 {
1812 1816 cpu_t *cp;
1813 1817
1814 1818 mutex_enter(&cpu_lock);
1815 1819
1816 1820 if ((cp = cpu_get(id)) == NULL) {
1817 1821 mutex_exit(&cpu_lock);
1818 1822 return (EINVAL);
1819 1823 }
1820 1824
1821 1825 ASSERT(cp->cpu_lpl != NULL);
1822 1826
1823 1827 *lp = cp->cpu_lpl->lpl_loadavg;
1824 1828
1825 1829 mutex_exit(&cpu_lock);
1826 1830
1827 1831 return (0);
1828 1832 }
1829 1833
/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL. (This list is required to be NULL
 * terminated, too). This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order. We hope this
 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		/* stop at the first entry sorted after the new leaf */
		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 * The lpl_id2rset reverse map is kept in sync with each move.
	 */
	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
		    i + 1;
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;

	/* The target now also covers all of the leaf's CPUs */
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}
1883 1887
1884 1888 /*
1885 1889 * Update each of lpl_parent's children with a reference to their parent.
1886 1890 * The lgrp topology is used as the reference since it is fully
1887 1891 * consistent and correct at this point.
1888 1892 * This should be called after any potential change in lpl_parent's
1889 1893 * rset.
1890 1894 */
1891 1895 static void
1892 1896 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1893 1897 {
1894 1898 klgrpset_t children;
1895 1899 int i;
1896 1900
1897 1901 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1898 1902 if (klgrpset_isempty(children))
1899 1903 return; /* nothing to do */
1900 1904
1901 1905 for (i = 0; i <= lgrp_alloc_max; i++) {
1902 1906 if (klgrpset_ismember(children, i)) {
1903 1907 /*
1904 1908 * (Re)set the parent. It may be incorrect if
1905 1909 * lpl_parent is new in the topology.
1906 1910 */
1907 1911 cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1908 1912 }
1909 1913 }
1910 1914 }
1911 1915
1912 1916 /*
1913 1917 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1914 1918 *
1915 1919 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1916 1920 * resource. The values are adjusted here, as this is the only place that we can
1917 1921 * be certain a resource was successfully deleted.
1918 1922 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int	i;
	lpl_t	*leaf;

	/* nothing to delete from an empty rset */
	if (lpl_target->lpl_nrset == 0)
		return;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/*
	 * return if leaf not found
	 * (when the loop falls through, i == lpl_nrset and this reads the
	 * rset's NULL terminator, which can never equal lpl_leaf)
	 */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/*
	 * prune leaf, compress array
	 * The NULL store at index lpl_nrset (before the decrement) keeps the
	 * rset NULL-terminated once the entries below are shifted down.
	 */
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
	lpl_target->lpl_ncpu--;
	do {
		/* shift the remaining entries down over the deleted slot */
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
		/*
		 * Update the lgrp id <=> rset mapping
		 */
		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
		}
	} while (i++ < lpl_target->lpl_nrset);
}
1952 1956
1953 1957 /*
1954 1958 * Check to see if the resource set of the target lpl contains the
1955 1959 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
1956 1960 */
1957 1961
1958 1962 int
1959 1963 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1960 1964 {
1961 1965 int i;
1962 1966
1963 1967 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1964 1968 if (lpl_target->lpl_rset[i] == lpl_leaf)
1965 1969 return (1);
1966 1970 }
1967 1971
1968 1972 return (0);
1969 1973 }
1970 1974
1971 1975 /*
1972 1976 * Called when we change cpu lpl membership. This increments or decrements the
1973 1977 * per-cpu counter in every lpl in which our leaf appears.
1974 1978 */
1975 1979 void
1976 1980 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1977 1981 {
1978 1982 cpupart_t *cpupart;
1979 1983 lgrp_t *lgrp_leaf;
1980 1984 lgrp_t *lgrp_cur;
1981 1985 lpl_t *lpl_leaf;
1982 1986 lpl_t *lpl_cur;
1983 1987 int i;
1984 1988
1985 1989 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1986 1990
1987 1991 cpupart = cp->cpu_part;
1988 1992 lpl_leaf = cp->cpu_lpl;
1989 1993 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1990 1994
1991 1995 for (i = 0; i <= lgrp_alloc_max; i++) {
1992 1996 lgrp_cur = lgrp_table[i];
1993 1997
1994 1998 /*
1995 1999 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1996 2000 * for the cpu in question, or if the current lgrp and leaf
1997 2001 * don't share the same resources.
1998 2002 */
1999 2003
2000 2004 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2001 2005 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2002 2006 lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2003 2007 continue;
2004 2008
2005 2009
2006 2010 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2007 2011
2008 2012 if (lpl_cur->lpl_nrset > 0) {
2009 2013 if (act == LPL_INCREMENT) {
2010 2014 lpl_cur->lpl_ncpu++;
2011 2015 } else if (act == LPL_DECREMENT) {
2012 2016 lpl_cur->lpl_ncpu--;
2013 2017 }
2014 2018 }
2015 2019 }
2016 2020 }
2017 2021
2018 2022 /*
2019 2023 * Initialize lpl with given resources and specified lgrp
2020 2024 */
2021 2025 void
2022 2026 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2023 2027 {
2024 2028 lpl->lpl_lgrpid = lgrp->lgrp_id;
2025 2029 lpl->lpl_loadavg = 0;
2026 2030 if (lpl == lpl_leaf)
2027 2031 lpl->lpl_ncpu = 1;
2028 2032 else
2029 2033 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2030 2034 lpl->lpl_nrset = 1;
2031 2035 lpl->lpl_rset[0] = lpl_leaf;
2032 2036 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2033 2037 lpl->lpl_lgrp = lgrp;
2034 2038 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2035 2039 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2036 2040 }
2037 2041
2038 2042 /*
2039 2043 * Clear an unused lpl
2040 2044 */
2041 2045 void
2042 2046 lpl_clear(lpl_t *lpl)
2043 2047 {
2044 2048 /*
2045 2049 * Clear out all fields in the lpl except:
2046 2050 * lpl_lgrpid - to facilitate debugging
2047 2051 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2048 2052 *
2049 2053 * Note that the lpl's rset and id2rset mapping are cleared as well.
2050 2054 */
2051 2055 lpl->lpl_loadavg = 0;
2052 2056 lpl->lpl_ncpu = 0;
2053 2057 lpl->lpl_lgrp = NULL;
2054 2058 lpl->lpl_parent = NULL;
2055 2059 lpl->lpl_cpus = NULL;
2056 2060 lpl->lpl_nrset = 0;
2057 2061 lpl->lpl_homed_time = 0;
2058 2062 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2059 2063 bzero(lpl->lpl_id2rset,
2060 2064 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2061 2065 }
2062 2066
2063 2067 /*
2064 2068 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system. The lpl topology may not
2066 2070 * make full use of all of the lgroup topology, but this checks to make sure
2067 2071 * that for the parts that it does use, it has correctly understood the
2068 2072 * relationships that exist. This function returns
2069 2073 * 0 if the topology is correct, and a non-zero error code, for non-debug
2070 2074 * kernels if incorrect. Asserts are spread throughout the code to aid in
2071 2075 * debugging on a DEBUG kernel.
2072 2076 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
	lgrp_t		*lgrp;
	lpl_t		*lpl;
	klgrpset_t	rset;
	klgrpset_t	cset;
	cpu_t		*cpu;
	cpu_t		*cp_start;
	int		i;
	int		j;
	int		sum;

	/* topology can't be incorrect if it doesn't exist */
	if (!lgrp_topo_initialized || !lgrp_initialized)
		return (LPL_TOPO_CORRECT);

	ASSERT(cpupart != NULL);

	/* check each lpl in the partition against its lgroup */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		lpl = NULL;
		/* make sure lpls are allocated */
		ASSERT(cpupart->cp_lgrploads);
		if (!cpupart->cp_lgrploads)
			return (LPL_TOPO_PART_HAS_NO_LPL);

		lpl = &cpupart->cp_lgrploads[i];
		/* make sure our index is good */
		ASSERT(i < cpupart->cp_nlgrploads);

		/* if lgroup doesn't exist, make sure lpl is empty */
		if (!LGRP_EXISTS(lgrp)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			} else {
				continue;
			}
		}

		/* verify that lgroup and lpl are identically numbered */
		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}
			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl; /* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}


		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent);
			ASSERT(lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/*
		 * only leaf lgroups keep a cpucnt, only check leaves
		 * (an lpl is a leaf when its rset names exactly itself)
		 */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list. This only exists in
			 * leaves, but they should always match.
			 *
			 * lpl_cpus is a circular list (see the splice in
			 * lgrp_part_add_cpu()), so we stop when the walk
			 * wraps back around to cp_start.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/* check next cpu */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t intersect;
				lgrp_t *lgrp_cand;
				lpl_t *lpl_cand;

				lgrp_cand = lgrp_table[j];
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				/*
				 * Skip candidates that don't exist, have no
				 * leaves in this partition, or share no CPU
				 * resources with it.
				 */
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand, lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}

		} else { /* non-leaf specific checks */

			/*
			 * Non-leaf lpls should have lpl_cpus == NULL
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf resources
			 * is equal to the total ncpu in the intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}

		/*
		 * Check the rset of the lpl in question. Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set. (Which would be resources somehow not
		 * accounted for).
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained with in partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; j < lpl->lpl_nrset; j++) {
			if (lpl->lpl_rset[j] == NULL)
				break;
		}

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}

	}
	return (LPL_TOPO_CORRECT);
}
2315 2319
2316 2320 /*
2317 2321 * Flatten lpl topology to given number of levels. This is presently only
2318 2322 * implemented for a flatten to 2 levels, which will prune out the intermediates
2319 2323 * and home the leaf lpls to the root lpl.
2320 2324 */
int
lpl_topo_flatten(int levels)
{
	int		i;
	uint_t		sum;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_root;
	cpupart_t	*cp;

	/* only the flatten to 2 levels (root + leaves) is implemented */
	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	/* walk the circular list of cpu partitions */
	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			/* skip the root and entries that are wholly absent */
			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted. (And
				 * whose parent will soon be deleted). Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}

		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	/* on success, return the number of levels we flattened to */
	return (levels);
}
2390 2394
2391 2395 /*
2392 2396 * Insert a lpl into the resource hierarchy and create any additional lpls that
2393 2397 * are necessary to represent the varying states of locality for the cpu
 * resources newly added to the partition.
2395 2399 *
2396 2400 * This routine is clever enough that it can correctly add resources from the
2397 2401 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie,
2398 2402 * those for which the lpl is a leaf as opposed to simply a named equally local
2399 2403 * resource). The one special case that needs additional processing is when a
2400 2404 * new intermediate lpl is introduced. Since the main loop only traverses
2401 2405 * looking to add the leaf resource where it does not yet exist, additional work
2402 2406 * is necessary to add other leaf resources that may need to exist in the newly
2403 2407 * created intermediate. This is performed by the second inner loop, and is
2404 2408 * only done when the check for more than one overlapping resource succeeds.
2405 2409 */
2406 2410
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	lgrp_id_t	parent_id;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	/*
	 * Walk every lgroup that names the leaf among its CPU resources
	 * and add the leaf to the corresponding lpl in this partition.
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't insert if the lgrp isn't there, if the leaf isn't
		 * contained within the current lgrp, or if the current lgrp has
		 * no leaves in this partition
		 */

		if (!LGRP_EXISTS(lgrp_cur) ||
		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
		if (lgrp_cur->lgrp_parent != NULL) {
			/* if lgrp has a parent, assign it properly */
			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
			lpl_parent = &cpupart->cp_lgrploads[parent_id];
		} else {
			/* if not, make sure parent ptr gets set to null */
			lpl_parent = NULL;
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * Almost all leaf state was initialized elsewhere. The
			 * only thing left to do is to set the parent.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		/* (re)build this intermediate lpl around the new leaf */
		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * If so, figure out what lpls have resources that
			 * intersect this one, and add them.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_ismember(rset_intersect,
				    lgrp_cand->lgrp_id))
					continue;
				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
				lpl_rset_add(lpl_cur, lpl_cand);
			}
		}
		/*
		 * This lpl's rset has changed. Update the hint in its
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}
2491 2495
2492 2496 /*
2493 2497 * remove a lpl from the hierarchy of resources, clearing its state when
2494 2498 * finished. If the lpls at the intermediate levels of the hierarchy have no
2495 2499 * remaining resources, or no longer name a leaf resource in the cpu-partition,
2496 2500 * delete them as well.
2497 2501 */
2498 2502
void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	klgrpset_t	leaf_intersect;	/* intersection of leaves */

	/*
	 * Remove the leaf from the rset of every lpl that names it, pruning
	 * any intermediate lpls that are left without leaf resources.
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't attempt to remove from lgrps that aren't there, that
		 * don't contain our leaf, or from the leaf itself. (We do that
		 * later)
		 */

		if (!LGRP_EXISTS(lgrp_cur))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    (lpl_cur == lpl_leaf)) {
			continue;
		}

		/*
		 * This is a slightly sleazy simplification in that we have
		 * already marked the cp_lgrpset as no longer containing the
		 * leaf we've deleted. Any lpls that pass the above checks
		 * based upon lgrp membership but not necessarily cpu-part
		 * membership also get cleared by the checks below. Currently
		 * this is harmless, as the lpls should be empty anyway.
		 *
		 * In particular, we want to preserve lpls that have additional
		 * leaf resources, even though we don't yet have a processor
		 * architecture that represents resources this way.
		 */

		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset);

		lpl_rset_del(lpl_cur, lpl_leaf);
		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
			/* no leaves left here: delete this lpl outright */
			lpl_clear(lpl_cur);
		} else {
			/*
			 * Update this lpl's children
			 */
			lpl_child_update(lpl_cur, cpupart);
		}
	}
	/* finally, clear the departing leaf itself */
	lpl_clear(lpl_leaf);
}
2555 2559
2556 2560 /*
 * add a cpu to a partition in terms of lgrp load avg bookkeeping
2558 2562 *
2559 2563 * The lpl (cpu partition load average information) is now arranged in a
2560 2564 * hierarchical fashion whereby resources that are closest, ie. most local, to
2561 2565 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
2563 2567 *
2564 2568 * 1. A lpl structure that contains resources already in the hierarchy tree.
2565 2569 * In this case, all of the associated lpl relationships have been defined, and
2566 2570 * all that is necessary is that we link the new cpu into the per-lpl list of
2567 2571 * cpus, and increment the ncpu count of all places where this cpu resource will
2568 2572 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2569 2573 * pushing is accomplished by this routine.
2570 2574 *
2571 2575 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2572 2576 * not exist yet. In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
2574 2578 * resources, if they should exist. The leaf structure is initialized by this
2575 2579 * routine, as is the cpu-partition state for the lgrp membership. This routine
2576 2580 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 * and builds all of the "ancestral" state necessary to identify resources at
2578 2582 * differing levels of locality.
2579 2583 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lpl_t		*lpl_leaf;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	cpupart = cp->cpu_part;
	lgrp_leaf = lgrp_table[lgrpid];

	/* don't add non-existent lgrp */
	ASSERT(LGRP_EXISTS(lgrp_leaf));
	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
	cp->cpu_lpl = lpl_leaf;

	/* only leaf lpls contain cpus */

	if (lpl_leaf->lpl_ncpu++ == 0) {
		/*
		 * First cpu in this lgroup for this partition: build the
		 * leaf lpl, mark the lgroup as present in the partition,
		 * and splice the leaf (plus any needed intermediates) into
		 * the partition's lpl hierarchy.
		 */
		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
		lpl_leaf_insert(lpl_leaf, cpupart);
	} else {
		/*
		 * the lpl should already exist in the parent, so just update
		 * the count of available CPUs
		 */
		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
	}

	/* link cpu into list of cpus in lpl */

	if (lpl_leaf->lpl_cpus) {
		/* splice into the circular doubly-linked list, at the tail */
		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
	} else {
		/*
		 * We increment ncpu immediately after we create a new leaf
		 * lpl, so assert that ncpu == 1 for the case where we don't
		 * have any cpu pointers yet.
		 */
		ASSERT(lpl_leaf->lpl_ncpu == 1);
		/* a singleton circular list: cp points at itself both ways */
		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
	}

}
2630 2634
2631 2635
2632 2636 /*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2634 2638 *
2635 2639 * The lpl (cpu partition load average information) is now arranged in a
2636 2640 * hierarchical fashion whereby resources that are closest, ie. most local, to
2637 2641 * the cpu in question are considered to be leaves in a tree of resources.
2638 2642 * There are two removal cases in question:
2639 2643 *
2640 2644 * 1. Removal of the resource in the leaf leaves other resources remaining in
2641 2645 * that leaf. (Another cpu still exists at this level of locality). In this
 * case, the count of available cpus is decremented in all associated lpls by
2643 2647 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2644 2648 * from the per-cpu lpl list.
2645 2649 *
2646 2650 * 2. Removal of the resource results in the lpl containing no resources. (It's
2647 2651 * empty) In this case, all of what has occurred for the first step must take
2648 2652 * place; however, additionally we must remove the lpl structure itself, prune
2649 2653 * out any stranded lpls that do not directly name a leaf resource, and mark the
2650 2654 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted. Cpu-partition changes are handled by this
2652 2656 * method, but the lpl_leaf_remove function deals with the details of pruning
2653 2657 * out the empty lpl and any of its orphaned direct ancestors.
2654 2658 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
	lpl_t		*lpl;
	lpl_t		*leaf_lpl;
	lgrp_t		*lgrp_leaf;

	/* called sometimes w/ cpus paused - grab no locks */

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	lpl = leaf_lpl = cp->cpu_lpl;
	lgrp_leaf = leaf_lpl->lpl_lgrp;

	/* don't delete a leaf that isn't there */
	ASSERT(LGRP_EXISTS(lgrp_leaf));

	/* no double-deletes */
	ASSERT(lpl->lpl_ncpu);
	if (--lpl->lpl_ncpu == 0) {
		/*
		 * This was the last cpu in this lgroup for this partition,
		 * clear its bit in the partition's lgroup bitmask
		 */
		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

		/* eliminate remaining lpl link pointers in cpu, lpl */
		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

		/* prune the now-empty leaf and any orphaned ancestors */
		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
	} else {

		/* unlink cpu from lists of cpus in lpl */
		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
		if (lpl->lpl_cpus == cp) {
			/* cp was the list head; advance the head pointer */
			lpl->lpl_cpus = cp->cpu_next_lpl;
		}

		/*
		 * Update the cpu count in the lpls associated with parent
		 * lgroups.
		 */
		lpl_cpu_adjcnt(LPL_DECREMENT, cp);

	}
	/* clear cpu's lpl ptr when we're all done */
	cp->cpu_lpl = NULL;
}
2704 2708
2705 2709 /*
2706 2710 * Recompute load average for the specified partition/lgrp fragment.
2707 2711 *
2708 2712 * We rely on the fact that this routine is called from the clock thread
2709 2713 * at a point before the clock thread can block (i.e. before its first
2710 2714 * lock request). Since the clock thread can not be preempted (since it
2711 2715 * runs at highest priority), we know that cpu partitions can not change
2712 2716 * (since doing so would require either the repartition requester or the
2713 2717 * cpu_pause thread to run on this cpu), so we can update the cpu's load
2714 2718 * without grabbing cpu_lock.
2715 2719 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t	ncpu;
	int64_t	old, new, f;

	/*
	 * Fixed-point decay factor table, indexed by the lpl's cpu count:
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	/* update this lpl, then walk up through each of its ancestors */
	for (;;) {

		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 * Each branch below retries its compare-and-swap until no
		 * concurrent updater has changed lpl_loadavg underneath us.
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu. We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			do {
				old = new = lpl->lpl_loadavg;
				/* split into integer and fraction parts */
				q = (old >> 16) << 7;
				r = (old & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}
2807 2811
/*
 * Initialize lpl topology in the target based on topology currently present in
 * lpl_bootstrap.
 *
 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
 * initialize cp_default list of lpls. Up to this point all topology operations
 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
 * `target' points to the list of lpls in cp_default and `size' is the size of
 * this list.
 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
 *
 * 1) Copies all fields from lpl_bootstrap to the target.
 *
 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 *
 * 3) Updates lpl_parent pointers to point to the lpls in the target list
 *    instead of lpl_bootstrap.
 *
 * 4) Updates pointers in the resource list of the target to point to the lpls
 *    in the target list instead of lpl_bootstrap.
 *
 * After lpl_topo_bootstrap() completes, target contains the same information
 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. There is no need for the information in lpl_bootstrap after
 * this and it is bzeroed.
 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
	lpl_t *lpl = lpl_bootstrap;
	lpl_t *target_lpl = target;
	lpl_t **rset;
	int *id2rset;
	int sz;
	int howmany;
	int id;
	int i;

	/*
	 * The only target that should be passed here is cp_default lpl list.
	 */
	ASSERT(target == cp_default.cp_lgrploads);
	ASSERT(size == cp_default.cp_nlgrploads);
	ASSERT(!lgrp_topo_initialized);
	ASSERT(ncpus == 1);

	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
		/*
		 * Copy all fields from lpl, except for the rset,
		 * lgrp id <=> rset mapping storage,
		 * and amount of storage
		 */
		rset = target_lpl->lpl_rset;
		id2rset = target_lpl->lpl_id2rset;
		sz = target_lpl->lpl_rset_sz;

		/* struct copy; the saved pointers are restored below */
		*target_lpl = *lpl;

		target_lpl->lpl_rset_sz = sz;
		target_lpl->lpl_rset = rset;
		target_lpl->lpl_id2rset = id2rset;

		/*
		 * Substitute CPU0 lpl pointer with one relative to target.
		 */
		if (lpl->lpl_cpus == CPU) {
			ASSERT(CPU->cpu_lpl == lpl);
			CPU->cpu_lpl = target_lpl;
		}

		/*
		 * Substitute parent information with parent relative to
		 * target, preserving the parent's offset within the array.
		 */
		if (lpl->lpl_parent != NULL)
			target_lpl->lpl_parent = (lpl_t *)
			    (((uintptr_t)lpl->lpl_parent -
			    (uintptr_t)lpl_bootstrap) +
			    (uintptr_t)target);

		/*
		 * Walk over resource set substituting pointers relative to
		 * lpl_bootstrap's rset to pointers relative to target's
		 */
		ASSERT(lpl->lpl_nrset <= 1);

		for (id = 0; id < lpl->lpl_nrset; id++) {
			if (lpl->lpl_rset[id] != NULL) {
				target_lpl->lpl_rset[id] = (lpl_t *)
				    (((uintptr_t)lpl->lpl_rset[id] -
				    (uintptr_t)lpl_bootstrap) +
				    (uintptr_t)target);
			}
			target_lpl->lpl_id2rset[id] =
			    lpl->lpl_id2rset[id];
		}
	}

	/*
	 * Clean up the bootstrap lpls since we have switched over to the
	 * actual lpl array in the default cpu partition.
	 *
	 * We still need to keep one empty lpl around for newly starting
	 * slave CPUs to reference should they need to make it through the
	 * dispatcher prior to their lgrp/lpl initialization.
	 *
	 * The lpl related dispatcher code has been designed to work properly
	 * (and without extra checks) for this special case of a zero'ed
	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
	 * with lgrpid 0 and an empty resource set. Iteration over the rset
	 * array by the dispatcher is also NULL terminated for this reason.
	 *
	 * This provides the desired behaviour for an uninitialized CPU.
	 * It shouldn't see any other CPU to either dispatch to or steal
	 * from until it is properly initialized.
	 */
	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));

	/* Rewire the one surviving empty lpl to its (zeroed) storage. */
	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
}
2933 2937
/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

/* Threshold is scaled down by the number of CPUs in the lgroup. */
#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

/* Difference is scaled down by the number of CPUs in the lgroup. */
#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / ncpu)

/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2995 2999
/*
 * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
 * be bound to a CPU or processor set.
 *
 * Arguments:
 *	t		The thread
 *	cpupart		The partition the thread belongs to.
 *
 * Returns the chosen lpl (never NULL); also updates the partition's
 * cp_lgrp_hint and stamps the chosen lpl's lpl_homed_time.
 *
 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
 *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
 *	 partitions changing out from under us and assumes that given thread is
 *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
 *	 disabled, so don't grab any locks because we should never block under
 *	 those conditions.
 */
lpl_t *
lgrp_choose(kthread_t *t, cpupart_t *cpupart)
{
	lgrp_load_t	bestload, bestrload;
	int		lgrpid_offset, lgrp_count;
	lgrp_id_t	lgrpid, lgrpid_start;
	lpl_t		*lpl, *bestlpl, *bestrlpl;
	klgrpset_t	lgrpset;
	proc_t		*p;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));
	ASSERT(cpupart != NULL);

	p = t->t_procp;

	/* A process should always be in an active partition */
	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));

	/*
	 * bestlpl tracks the best candidate among lgroups the process
	 * already spans; bestrlpl ("remote") tracks the best among the
	 * rest of the partition.
	 */
	bestlpl = bestrlpl = NULL;
	bestload = bestrload = LGRP_LOADAVG_MAX;
	lgrpset = cpupart->cp_lgrpset;

	/* Pick a starting point for the circular search below. */
	switch (lgrp_choose_policy) {
	case LGRP_CHOOSE_RR:
		lgrpid = cpupart->cp_lgrp_hint;
		do {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;
		} while (!klgrpset_ismember(lgrpset, lgrpid));

		break;
	default:
	case LGRP_CHOOSE_TIME:
	case LGRP_CHOOSE_RANDOM:
		/* pseudo-random member of the partition's lgroup set */
		klgrpset_nlgrps(lgrpset, lgrp_count);
		lgrpid_offset =
		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
		for (lgrpid = 0; ; lgrpid++) {
			if (klgrpset_ismember(lgrpset, lgrpid)) {
				if (--lgrpid_offset == 0)
					break;
			}
		}
		break;
	}

	lgrpid_start = lgrpid;

	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
	    lgrp_id_t, cpupart->cp_lgrp_hint);

	/*
	 * Use lgroup affinities (if any) to choose best lgroup
	 *
	 * NOTE: Assumes that thread is protected from going away and its
	 *	 lgroup affinities won't change (ie. p_lock, or
	 *	 thread_lock() being held and/or CPUs paused)
	 */
	if (t->t_lgrp_affinity) {
		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
		if (lpl != NULL)
			return (lpl);
	}

	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));

	/* One full circular pass over all lgroup IDs in the partition. */
	do {
		pgcnt_t	npgs;

		/*
		 * Skip any lgroups outside of thread's pset
		 */
		if (!klgrpset_ismember(lgrpset, lgrpid)) {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;	/* wrap the search */
			continue;
		}

		/*
		 * Skip any non-leaf lgroups
		 *
		 * NOTE(review): this continue does not advance lgrpid, so it
		 * relies on cp_lgrpset members always being leaves (which
		 * the CPU-membership semantics of cp_lgrpset suggest) —
		 * confirm, else this would spin.
		 */
		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
			continue;

		/*
		 * Skip any lgroups without enough free memory
		 * (when threshold set to nonzero positive value)
		 */
		if (lgrp_mem_free_thresh > 0) {
			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
			if (npgs < lgrp_mem_free_thresh) {
				if (++lgrpid > lgrp_alloc_max)
					lgrpid = 0;	/* wrap the search */
				continue;
			}
		}

		lpl = &cpupart->cp_lgrploads[lgrpid];
		if (klgrpset_isempty(p->p_lgrpset) ||
		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
			/*
			 * Either this is a new process or the process already
			 * has threads on this lgrp, so this is a preferred
			 * lgroup for the thread.
			 */
			if (bestlpl == NULL ||
			    lpl_pick(lpl, bestlpl)) {
				bestload = lpl->lpl_loadavg;
				bestlpl = lpl;
			}
		} else {
			/*
			 * The process doesn't have any threads on this lgrp,
			 * but we're willing to consider this lgrp if the load
			 * difference is big enough to justify splitting up
			 * the process' threads.
			 */
			if (bestrlpl == NULL ||
			    lpl_pick(lpl, bestrlpl)) {
				bestrload = lpl->lpl_loadavg;
				bestrlpl = lpl;
			}
		}
		if (++lgrpid > lgrp_alloc_max)
			lgrpid = 0;	/* wrap the search */
	} while (lgrpid != lgrpid_start);

	/*
	 * Return root lgroup if threshold isn't set to maximum value and
	 * lowest lgroup load average more than a certain threshold
	 */
	if (lgrp_load_thresh != UINT32_MAX &&
	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);

	/*
	 * If all the lgroups over which the thread's process is spread are
	 * heavily loaded, or otherwise undesirable, we'll consider placing
	 * the thread on one of the other leaf lgroups in the thread's
	 * partition.
	 *
	 * If bestrlpl is NULL here, bestrload is still LGRP_LOADAVG_MAX, so
	 * the (bestrload < bestload) test fails before bestrlpl is
	 * dereferenced.
	 */
	if ((bestlpl == NULL) ||
	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
	    (bestrload < bestload) &&	/* paranoid about wraparound */
	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
	    bestload))) {
		bestlpl = bestrlpl;
	}

	if (bestlpl == NULL) {
		/*
		 * No lgroup looked particularly good, but we still
		 * have to pick something. Go with the randomly selected
		 * legal lgroup we started with above.
		 */
		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
	}

	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
	bestlpl->lpl_homed_time = gethrtime_unscaled();

	ASSERT(bestlpl->lpl_ncpu > 0);
	return (bestlpl);
}
3177 3181
3178 3182 /*
3179 3183 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3180 3184 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3181 3185 */
3182 3186 static int
3183 3187 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3184 3188 {
3185 3189 lgrp_load_t l1, l2;
3186 3190 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3187 3191
3188 3192 l1 = lpl1->lpl_loadavg;
3189 3193 l2 = lpl2->lpl_loadavg;
3190 3194
3191 3195 if ((l1 + tolerance < l2) && (l1 < l2)) {
3192 3196 /* lpl1 is significantly less loaded than lpl2 */
3193 3197 return (1);
3194 3198 }
3195 3199
3196 3200 if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3197 3201 l1 + tolerance >= l2 && l1 < l2 &&
3198 3202 lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3199 3203 /*
3200 3204 * lpl1's load is within the tolerance of lpl2. We're
3201 3205 * willing to consider it be to better however if
3202 3206 * it has been longer since we last homed a thread there
3203 3207 */
3204 3208 return (1);
3205 3209 }
3206 3210
3207 3211 return (0);
3208 3212 }
3209 3213
/*
 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a
 * process that uses text replication changed home lgrp. This info is used by
 * the segvn asynchronous thread to detect if it needs to recheck what lgrps
 * should be used for text replication.
 */
static uint64_t lgrp_trthr_moves = 0;
3217 3221
3218 3222 uint64_t
3219 3223 lgrp_get_trthr_migrations(void)
3220 3224 {
3221 3225 return (lgrp_trthr_moves);
3222 3226 }
3223 3227
/*
 * Bump lgrp_trthr_moves by `incr'.  Atomic, so callers need no lock.
 */
void
lgrp_update_trthr_migrations(uint64_t incr)
{
	atomic_add_64(&lgrp_trthr_moves, incr);
}
3229 3233
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it (see lgrp_move_thread()).
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t	lgrp_min_nsec = LGRP_MIN_NSEC;
3238 3242
/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Locking:
 *	The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 *	This routine is not allowed to grab any locks, since it may be called
 *	with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/* walk the circular p_tlist of the process' threads */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			/* back the load out of the old lpl and its parents */
			lpl = oldlpl;
			for (;;) {
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (atomic_cas_32(
				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
				    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			/* publish p_t1_lgrpid before comparing p_tr_lgrpid */
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* remember when the anticipatory load was added */
		t->t_anttime = gethrtime();
	}
}
3436 3440
3437 3441 /*
3438 3442 * Return lgroup memory allocation policy given advice from madvise(3C)
3439 3443 */
3440 3444 lgrp_mem_policy_t
3441 3445 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3442 3446 {
3443 3447 switch (advice) {
3444 3448 case MADV_ACCESS_LWP:
3445 3449 return (LGRP_MEM_POLICY_NEXT);
3446 3450 case MADV_ACCESS_MANY:
3447 3451 return (LGRP_MEM_POLICY_RANDOM);
3448 3452 default:
3449 3453 return (lgrp_mem_policy_default(size, type));
3450 3454 }
3451 3455 }
3452 3456
3453 3457 /*
3454 3458 * Figure out default policy
3455 3459 */
3456 3460 lgrp_mem_policy_t
3457 3461 lgrp_mem_policy_default(size_t size, int type)
3458 3462 {
3459 3463 cpupart_t *cp;
3460 3464 lgrp_mem_policy_t policy;
3461 3465 size_t pset_mem_size;
3462 3466
3463 3467 /*
3464 3468 * Randomly allocate memory across lgroups for shared memory
3465 3469 * beyond a certain threshold
3466 3470 */
3467 3471 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3468 3472 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3469 3473 /*
3470 3474 * Get total memory size of current thread's pset
3471 3475 */
3472 3476 kpreempt_disable();
3473 3477 cp = curthread->t_cpupart;
3474 3478 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3475 3479 kpreempt_enable();
3476 3480
3477 3481 /*
3478 3482 * Choose policy to randomly allocate memory across
3479 3483 * lgroups in pset if it will fit and is not default
3480 3484 * partition. Otherwise, allocate memory randomly
3481 3485 * across machine.
3482 3486 */
3483 3487 if (lgrp_mem_pset_aware && size < pset_mem_size)
3484 3488 policy = LGRP_MEM_POLICY_RANDOM_PSET;
3485 3489 else
3486 3490 policy = LGRP_MEM_POLICY_RANDOM;
3487 3491 } else
3488 3492 /*
3489 3493 * Apply default policy for private memory and
3490 3494 * shared memory under the respective random
3491 3495 * threshold.
3492 3496 */
3493 3497 policy = lgrp_mem_default_policy;
3494 3498
3495 3499 return (policy);
3496 3500 }
3497 3501
3498 3502 /*
3499 3503 * Get memory allocation policy for this segment
3500 3504 */
3501 3505 lgrp_mem_policy_info_t *
3502 3506 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3503 3507 {
3504 3508 lgrp_mem_policy_info_t *policy_info;
3505 3509 extern struct seg_ops segspt_ops;
3506 3510 extern struct seg_ops segspt_shmops;
3507 3511
3508 3512 /*
3509 3513 * This is for binary compatibility to protect against third party
3510 3514 * segment drivers which haven't recompiled to allow for
3511 3515 * SEGOP_GETPOLICY()
3512 3516 */
3513 3517 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3514 3518 seg->s_ops != &segspt_shmops)
3515 3519 return (NULL);
3516 3520
3517 3521 policy_info = NULL;
3518 3522 if (seg->s_ops->getpolicy != NULL)
3519 3523 policy_info = SEGOP_GETPOLICY(seg, vaddr);
3520 3524
3521 3525 return (policy_info);
3522 3526 }
3523 3527
3524 3528 /*
3525 3529 * Set policy for allocating private memory given desired policy, policy info,
3526 3530 * size in bytes of memory that policy is being applied.
3527 3531 * Return 0 if policy wasn't set already and 1 if policy was set already
3528 3532 */
3529 3533 int
3530 3534 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3531 3535 lgrp_mem_policy_info_t *policy_info, size_t size)
3532 3536 {
3533 3537
3534 3538 ASSERT(policy_info != NULL);
3535 3539
3536 3540 if (policy == LGRP_MEM_POLICY_DEFAULT)
3537 3541 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3538 3542
3539 3543 /*
3540 3544 * Policy set already?
3541 3545 */
3542 3546 if (policy == policy_info->mem_policy)
3543 3547 return (1);
3544 3548
3545 3549 /*
3546 3550 * Set policy
3547 3551 */
3548 3552 policy_info->mem_policy = policy;
3549 3553 policy_info->mem_lgrpid = LGRP_NONE;
3550 3554
3551 3555 return (0);
3552 3556 }
3553 3557
3554 3558
3555 3559 /*
3556 3560 * Get shared memory allocation policy with given tree and offset
3557 3561 */
3558 3562 lgrp_mem_policy_info_t *
3559 3563 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3560 3564 u_offset_t vn_off)
3561 3565 {
3562 3566 u_offset_t off;
3563 3567 lgrp_mem_policy_info_t *policy_info;
3564 3568 lgrp_shm_policy_seg_t *policy_seg;
3565 3569 lgrp_shm_locality_t *shm_locality;
3566 3570 avl_tree_t *tree;
3567 3571 avl_index_t where;
3568 3572
3569 3573 /*
3570 3574 * Get policy segment tree from anon_map or vnode and use specified
3571 3575 * anon index or vnode offset as offset
3572 3576 *
3573 3577 * Assume that no lock needs to be held on anon_map or vnode, since
3574 3578 * they should be protected by their reference count which must be
3575 3579 * nonzero for an existing segment
3576 3580 */
3577 3581 if (amp) {
3578 3582 ASSERT(amp->refcnt != 0);
3579 3583 shm_locality = amp->locality;
3580 3584 if (shm_locality == NULL)
3581 3585 return (NULL);
3582 3586 tree = shm_locality->loc_tree;
3583 3587 off = ptob(anon_index);
3584 3588 } else if (vp) {
3585 3589 shm_locality = vp->v_locality;
3586 3590 if (shm_locality == NULL)
3587 3591 return (NULL);
3588 3592 ASSERT(shm_locality->loc_count != 0);
3589 3593 tree = shm_locality->loc_tree;
3590 3594 off = vn_off;
3591 3595 }
3592 3596
3593 3597 if (tree == NULL)
3594 3598 return (NULL);
3595 3599
3596 3600 /*
3597 3601 * Lookup policy segment for offset into shared object and return
3598 3602 * policy info
3599 3603 */
3600 3604 rw_enter(&shm_locality->loc_lock, RW_READER);
3601 3605 policy_info = NULL;
3602 3606 policy_seg = avl_find(tree, &off, &where);
3603 3607 if (policy_seg)
3604 3608 policy_info = &policy_seg->shm_policy;
3605 3609 rw_exit(&shm_locality->loc_lock);
3606 3610
3607 3611 return (policy_info);
3608 3612 }
3609 3613
/*
 * Default memory allocation policy for kernel segmap pages (used by
 * lgrp_mem_choose() when the faulting segment is segkmap).
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3614 3618
/*
 * Return lgroup to use for allocating memory
 * given the segment and address
 *
 * There isn't any mutual exclusion that exists between calls
 * to this routine and DR, so this routine and whomever calls it
 * should be mindful of the possibility that the lgrp returned
 * may be deleted. If this happens, dereferences of the lgrp
 * pointer will still be safe, but the resources in the lgrp will
 * be gone, and LGRP_EXISTS() will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int		i;
	lgrp_t		*lgrp;
	klgrpset_t	lgrpset;
	int		lgrps_spanned;
	unsigned long	off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t	random;
	int		stat = 0;	/* kstat for RANDOM_PROC/PSET cases */
	extern struct seg *segkmap;

	/*
	 * Just return the root lgroup if the lgrp framework hasn't finished
	 * initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			/*
			 * Kernel address space: only segkmap carries its own
			 * (tunable) policy, and the per-process/per-pset
			 * random policies don't apply to kernel memory, so
			 * degrade them to plain random.
			 */
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			/*
			 * User segment: look up any policy set on the
			 * segment/shared object for this address.
			 */
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL) {
				policy = policy_info->mem_policy;
				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
					lgrp_id_t id = policy_info->mem_lgrpid;
					ASSERT(id != LGRP_NONE);
					ASSERT(id < NLGRPS_MAX);
					lgrp = lgrp_table[id];
					if (!LGRP_EXISTS(lgrp)) {
						/*
						 * The lgroup the segment is
						 * bound to went away (DR);
						 * fall back to next-touch.
						 */
						policy = LGRP_MEM_POLICY_NEXT;
					} else {
						lgrp_stat_add(id,
						    LGRP_NUM_NEXT_SEG, 1);
						return (lgrp);
					}
				}
			}
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 *
		 * NOTE(review): 'stat' was set to LGRP_NUM_RANDOM_{PROC,PSET}
		 * above but LGRP_NUM_RANDOM is bumped here — confirm whether
		 * 'stat' was intended to be passed to lgrp_stat_add().
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		/*
		 * Walk forward from the home lgroup, counting only lgroups
		 * that have memory, until the computed offset is consumed.
		 */
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}
3870 3874
3871 3875 /*
3872 3876 * Return the number of pages in an lgroup
3873 3877 *
3874 3878 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3875 3879 * could cause tests that rely on the numat driver to fail....
3876 3880 */
3877 3881 pgcnt_t
3878 3882 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3879 3883 {
3880 3884 lgrp_t *lgrp;
3881 3885
3882 3886 lgrp = lgrp_table[lgrpid];
3883 3887 if (!LGRP_EXISTS(lgrp) ||
3884 3888 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3885 3889 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3886 3890 return (0);
3887 3891
3888 3892 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3889 3893 }
3890 3894
3891 3895 /*
3892 3896 * Initialize lgroup shared memory allocation policy support
3893 3897 */
3894 3898 void
3895 3899 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3896 3900 {
3897 3901 lgrp_shm_locality_t *shm_locality;
3898 3902
3899 3903 /*
3900 3904 * Initialize locality field in anon_map
3901 3905 * Don't need any locks because this is called when anon_map is
3902 3906 * allocated, but not used anywhere yet.
3903 3907 */
3904 3908 if (amp) {
3905 3909 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3906 3910 if (amp->locality == NULL) {
3907 3911 /*
3908 3912 * Allocate and initialize shared memory locality info
3909 3913 * and set anon_map locality pointer to it
3910 3914 * Drop lock across kmem_alloc(KM_SLEEP)
3911 3915 */
3912 3916 ANON_LOCK_EXIT(&->a_rwlock);
3913 3917 shm_locality = kmem_alloc(sizeof (*shm_locality),
3914 3918 KM_SLEEP);
3915 3919 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3916 3920 NULL);
3917 3921 shm_locality->loc_count = 1; /* not used for amp */
3918 3922 shm_locality->loc_tree = NULL;
3919 3923
3920 3924 /*
3921 3925 * Reacquire lock and check to see whether anyone beat
3922 3926 * us to initializing the locality info
3923 3927 */
3924 3928 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3925 3929 if (amp->locality != NULL) {
3926 3930 rw_destroy(&shm_locality->loc_lock);
3927 3931 kmem_free(shm_locality,
3928 3932 sizeof (*shm_locality));
3929 3933 } else
3930 3934 amp->locality = shm_locality;
3931 3935 }
3932 3936 ANON_LOCK_EXIT(&->a_rwlock);
3933 3937 return;
3934 3938 }
3935 3939
3936 3940 /*
3937 3941 * Allocate shared vnode policy info if vnode is not locality aware yet
3938 3942 */
3939 3943 mutex_enter(&vp->v_lock);
3940 3944 if ((vp->v_flag & V_LOCALITY) == 0) {
3941 3945 /*
3942 3946 * Allocate and initialize shared memory locality info
3943 3947 */
3944 3948 mutex_exit(&vp->v_lock);
3945 3949 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3946 3950 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3947 3951 shm_locality->loc_count = 1;
3948 3952 shm_locality->loc_tree = NULL;
3949 3953
3950 3954 /*
3951 3955 * Point vnode locality field at shared vnode policy info
3952 3956 * and set locality aware flag in vnode
3953 3957 */
3954 3958 mutex_enter(&vp->v_lock);
3955 3959 if ((vp->v_flag & V_LOCALITY) == 0) {
3956 3960 vp->v_locality = shm_locality;
3957 3961 vp->v_flag |= V_LOCALITY;
3958 3962 } else {
3959 3963 /*
3960 3964 * Lost race so free locality info and increment count.
3961 3965 */
3962 3966 rw_destroy(&shm_locality->loc_lock);
3963 3967 kmem_free(shm_locality, sizeof (*shm_locality));
3964 3968 shm_locality = vp->v_locality;
3965 3969 shm_locality->loc_count++;
3966 3970 }
3967 3971 mutex_exit(&vp->v_lock);
3968 3972
3969 3973 return;
3970 3974 }
3971 3975
3972 3976 /*
3973 3977 * Increment reference count of number of segments mapping this vnode
3974 3978 * shared
3975 3979 */
3976 3980 shm_locality = vp->v_locality;
3977 3981 shm_locality->loc_count++;
3978 3982 mutex_exit(&vp->v_lock);
3979 3983 }
3980 3984
3981 3985 /*
3982 3986 * Destroy the given shared memory policy segment tree
3983 3987 */
3984 3988 void
3985 3989 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3986 3990 {
3987 3991 lgrp_shm_policy_seg_t *cur;
3988 3992 lgrp_shm_policy_seg_t *next;
3989 3993
3990 3994 if (tree == NULL)
3991 3995 return;
3992 3996
3993 3997 cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3994 3998 while (cur != NULL) {
3995 3999 next = AVL_NEXT(tree, cur);
3996 4000 avl_remove(tree, cur);
3997 4001 kmem_free(cur, sizeof (*cur));
3998 4002 cur = next;
3999 4003 }
4000 4004 kmem_free(tree, sizeof (avl_tree_t));
4001 4005 }
4002 4006
4003 4007 /*
4004 4008 * Uninitialize lgroup shared memory allocation policy support
4005 4009 */
4006 4010 void
4007 4011 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4008 4012 {
4009 4013 lgrp_shm_locality_t *shm_locality;
4010 4014
4011 4015 /*
4012 4016 * For anon_map, deallocate shared memory policy tree and
4013 4017 * zero locality field
4014 4018 * Don't need any locks because anon_map is being freed
4015 4019 */
4016 4020 if (amp) {
4017 4021 if (amp->locality == NULL)
4018 4022 return;
4019 4023 shm_locality = amp->locality;
4020 4024 shm_locality->loc_count = 0; /* not really used for amp */
4021 4025 rw_destroy(&shm_locality->loc_lock);
4022 4026 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4023 4027 kmem_free(shm_locality, sizeof (*shm_locality));
4024 4028 amp->locality = 0;
4025 4029 return;
4026 4030 }
4027 4031
4028 4032 /*
4029 4033 * For vnode, decrement reference count of segments mapping this vnode
4030 4034 * shared and delete locality info if reference count drops to 0
4031 4035 */
4032 4036 mutex_enter(&vp->v_lock);
4033 4037 shm_locality = vp->v_locality;
4034 4038 shm_locality->loc_count--;
4035 4039
4036 4040 if (shm_locality->loc_count == 0) {
4037 4041 rw_destroy(&shm_locality->loc_lock);
4038 4042 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4039 4043 kmem_free(shm_locality, sizeof (*shm_locality));
4040 4044 vp->v_locality = 0;
4041 4045 vp->v_flag &= ~V_LOCALITY;
4042 4046 }
4043 4047 mutex_exit(&vp->v_lock);
4044 4048 }
4045 4049
4046 4050 /*
4047 4051 * Compare two shared memory policy segments
4048 4052 * Used by AVL tree code for searching
4049 4053 */
4050 4054 int
4051 4055 lgrp_shm_policy_compar(const void *x, const void *y)
4052 4056 {
4053 4057 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4054 4058 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4055 4059
4056 4060 if (a->shm_off < b->shm_off)
4057 4061 return (-1);
4058 4062 if (a->shm_off >= b->shm_off + b->shm_size)
4059 4063 return (1);
4060 4064 return (0);
4061 4065 }
4062 4066
4063 4067 /*
4064 4068 * Concatenate seg1 with seg2 and remove seg2
4065 4069 */
4066 4070 static int
4067 4071 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4068 4072 lgrp_shm_policy_seg_t *seg2)
4069 4073 {
4070 4074 if (!seg1 || !seg2 ||
4071 4075 seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4072 4076 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4073 4077 return (-1);
4074 4078
4075 4079 seg1->shm_size += seg2->shm_size;
4076 4080 avl_remove(tree, seg2);
4077 4081 kmem_free(seg2, sizeof (*seg2));
4078 4082 return (0);
4079 4083 }
4080 4084
4081 4085 /*
4082 4086 * Split segment at given offset and return rightmost (uppermost) segment
4083 4087 * Assumes that there are no overlapping segments
4084 4088 */
4085 4089 static lgrp_shm_policy_seg_t *
4086 4090 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4087 4091 u_offset_t off)
4088 4092 {
4089 4093 lgrp_shm_policy_seg_t *newseg;
4090 4094 avl_index_t where;
4091 4095
4092 4096 ASSERT(seg != NULL);
4093 4097 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4094 4098
4095 4099 if (!seg || off < seg->shm_off || off > seg->shm_off +
4096 4100 seg->shm_size)
4097 4101 return (NULL);
4098 4102
4099 4103 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4100 4104 return (seg);
4101 4105
4102 4106 /*
4103 4107 * Adjust size of left segment and allocate new (right) segment
4104 4108 */
4105 4109 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4106 4110 newseg->shm_policy = seg->shm_policy;
4107 4111 newseg->shm_off = off;
4108 4112 newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4109 4113 seg->shm_size = off - seg->shm_off;
4110 4114
4111 4115 /*
4112 4116 * Find where to insert new segment in AVL tree and insert it
4113 4117 */
4114 4118 (void) avl_find(tree, &off, &where);
4115 4119 avl_insert(tree, newseg, where);
4116 4120
4117 4121 return (newseg);
4118 4122 }
4119 4123
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;	/* end offset of requested range */
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;	/* current offset into shared object */
	u_offset_t		oldeoff;	/* end offset of found segment */
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 *
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		/* Drop the lock across the sleeping allocation */
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 *
	 * Each iteration consumes a prefix of the remaining [off, off + len)
	 * range, either by inserting a new segment into a gap or by updating
	 * (and possibly splitting) an existing segment that contains off.
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				/* Skip past this segment and keep going */
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_lgrpid = LGRP_NONE;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_lgrpid = LGRP_NONE;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
4375 4379
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;	/* memnodes not yet returned */
	int		cnt = c->lmc_cnt;	/* count of those memnodes */
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 * Already-tried memnodes (lmc_tried) are excluded at each level.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have dissapeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		/* Walk the bitmask until the offset'th set bit is found */
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);	/* wrap if offset overshot */

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
↓ open down ↓ |
3909 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX