1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2018 Joyent, Inc.
  25  */
  26 
  27 /*
  28  * Basic NUMA support in terms of locality groups
  29  *
  30  * Solaris needs to know which CPUs, memory, etc. are near each other to
  31  * provide good performance on NUMA machines by optimizing for locality.
  32  * In order to do this, a new abstraction called a "locality group (lgroup)"
  33  * has been introduced to keep track of which CPU-like and memory-like hardware
  34  * resources are close to each other.  Currently, latency is the only measure
  35  * used to determine how to group hardware resources into lgroups, but this
  36  * does not limit the groupings to be based solely on latency.  Other factors
  37  * may be used to determine the groupings in the future.
  38  *
   39  * Lgroups are organized into a hierarchy or topology that represents the
  40  * latency topology of the machine.  There is always at least a root lgroup in
  41  * the system.  It represents all the hardware resources in the machine at a
  42  * latency big enough that any hardware resource can at least access any other
  43  * hardware resource within that latency.  A Uniform Memory Access (UMA)
  44  * machine is represented with one lgroup (the root).  In contrast, a NUMA
  45  * machine is represented at least by the root lgroup and some number of leaf
  46  * lgroups where the leaf lgroups contain the hardware resources within the
  47  * least latency of each other and the root lgroup still contains all the
  48  * resources in the machine.  Some number of intermediate lgroups may exist
  49  * which represent more levels of locality than just the local latency of the
  50  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
   51  * (e.g. root and intermediate lgroups) contain the next nearest resources to
   52  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
  53  * to the root lgroup shows the hardware resources from closest to farthest
  54  * from the leaf lgroup such that each successive ancestor lgroup contains
  55  * the next nearest resources at the next level of locality from the previous.
  56  *
  57  * The kernel uses the lgroup abstraction to know how to allocate resources
  58  * near a given process/thread.  At fork() and lwp/thread_create() time, a
  59  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
  60  * with the lowest load average.  Binding to a processor or processor set will
  61  * change the home lgroup for a thread.  The scheduler has been modified to try
  62  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
  63  * allocation is lgroup aware too, so memory will be allocated from the current
  64  * thread's home lgroup if possible.  If the desired resources are not
  65  * available, the kernel traverses the lgroup hierarchy going to the parent
  66  * lgroup to find resources at the next level of locality until it reaches the
  67  * root lgroup.
  68  */
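
/*
 * To illustrate the traversal described above: a rough sketch (not code
 * taken from this file) of how a consumer might walk from a thread's home
 * lgroup toward the root, considering resources at each successively more
 * distant level of locality, could look like:
 *
 *	lpl_t	*lpl = curthread->t_lpl;
 *	lgrp_t	*lgrp = lgrp_table[lpl->lpl_lgrpid];
 *
 *	while (lgrp != NULL) {
 *		(try to satisfy the request from lgrp's resources,
 *		e.g. lgrp->lgrp_set[LGRP_RSRC_MEM])
 *		lgrp = lgrp->lgrp_parent;
 *	}
 *
 * The real allocation paths (e.g. lgrp_mem_choose()) are more involved,
 * but they follow this same parent-chain traversal.
 */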
  69 
  70 #include <sys/lgrp.h>
  71 #include <sys/lgrp_user.h>
  72 #include <sys/types.h>
  73 #include <sys/mman.h>
  74 #include <sys/param.h>
  75 #include <sys/var.h>
  76 #include <sys/thread.h>
  77 #include <sys/cpuvar.h>
  78 #include <sys/cpupart.h>
  79 #include <sys/kmem.h>
  80 #include <vm/seg.h>
  81 #include <vm/seg_kmem.h>
  82 #include <vm/seg_spt.h>
  83 #include <vm/seg_vn.h>
  84 #include <vm/as.h>
  85 #include <sys/atomic.h>
  86 #include <sys/systm.h>
  87 #include <sys/errno.h>
  88 #include <sys/cmn_err.h>
  89 #include <sys/kstat.h>
  90 #include <sys/sysmacros.h>
  91 #include <sys/pg.h>
  92 #include <sys/promif.h>
  93 #include <sys/sdt.h>
  94 #include <sys/ht.h>
  95 
  96 lgrp_gen_t      lgrp_gen = 0;           /* generation of lgroup hierarchy */
  97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
  98                                 /* indexed by lgrp_id */
  99 int     nlgrps;                 /* number of lgroups in machine */
 100 int     lgrp_alloc_hint = -1;   /* hint for where to try to allocate next */
 101 int     lgrp_alloc_max = 0;     /* max lgroup ID allocated so far */
 102 
 103 /*
 104  * Kstat data for lgroups.
 105  *
 106  * Actual kstat data is collected in lgrp_stats array.
 107  * The lgrp_kstat_data array of named kstats is used to extract data from
  108  * lgrp_stats and present it to the kstat framework. It is protected from parallel
 109  * modifications by lgrp_kstat_mutex. This may cause some contention when
 110  * several kstat commands run in parallel but this is not the
 111  * performance-critical path.
 112  */
 113 extern struct lgrp_stats lgrp_stats[];  /* table of per-lgrp stats */
 114 
 115 /*
 116  * Declare kstat names statically for enums as defined in the header file.
 117  */
 118 LGRP_KSTAT_NAMES;
 119 
 120 static void     lgrp_kstat_init(void);
 121 static int      lgrp_kstat_extract(kstat_t *, int);
 122 static void     lgrp_kstat_reset(lgrp_id_t);
 123 
 124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
 125 static kmutex_t lgrp_kstat_mutex;
 126 
 127 
 128 /*
 129  * max number of lgroups supported by the platform
 130  */
 131 int     nlgrpsmax = 0;
 132 
 133 /*
 134  * The root lgroup. Represents the set of resources at the system wide
 135  * level of locality.
 136  */
 137 lgrp_t          *lgrp_root = NULL;
 138 
 139 /*
 140  * During system bootstrap cp_default does not contain the list of lgrp load
 141  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 142  * on-line when cp_default is initialized by cpupart_initialize_default().
 143  * Configuring CPU0 may create a two-level topology with root and one leaf node
 144  * containing CPU0. This topology is initially constructed in a special
 145  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 146  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 147  * for all lpl operations until cp_default is fully constructed.
 148  *
 149  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
  150  * consumer that needs a default lpl should use lpl_bootstrap, which points to
 151  * the first element of lpl_bootstrap_list.
 152  *
  153  * CPUs that are added to the system but have not yet been assigned to an
 154  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 155  * on some architectures (x86) it's possible for the slave CPU startup thread
 156  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 157  */
 158 #define LPL_BOOTSTRAP_SIZE 2
 159 static lpl_t    lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
 160 lpl_t           *lpl_bootstrap;
 161 static lpl_t    *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
 162 static int      lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
 163 
 164 /*
 165  * If cp still references the bootstrap lpl, it has not yet been added to
 166  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 167  * a thread is trying to allocate memory close to a CPU that has no lgrp.
 168  */
 169 #define LGRP_CPU_HAS_NO_LGRP(cp)        ((cp)->cpu_lpl == lpl_bootstrap)
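
/*
 * Illustrative sketch only (one plausible way a caller could use the macro
 * above): fall back to the root lgroup while the CPU is still on the
 * bootstrap lpl, e.g.
 *
 *	lgrp_t	*lgrp;
 *
 *	if (LGRP_CPU_HAS_NO_LGRP(cp))
 *		lgrp = lgrp_root;
 *	else
 *		lgrp = lgrp_table[cp->cpu_lpl->lpl_lgrpid];
 */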
 170 
 171 static lgrp_t   lroot;
 172 
 173 /*
 174  * Size, in bytes, beyond which random memory allocation policy is applied
 175  * to non-shared memory.  Default is the maximum size, so random memory
 176  * allocation won't be used for non-shared memory by default.
 177  */
 178 size_t  lgrp_privm_random_thresh = (size_t)(-1);
 179 
  180 /* the maximum effect that a single thread can have on its lgroup's load */
 181 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
 182         ((lgrp_loadavg_max_effect) / (ncpu))
 183 uint32_t        lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
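
/*
 * Worked example (numbers chosen purely for illustration): with
 * lgrp_loadavg_max_effect left at LGRP_LOADAVG_THREAD_MAX and an lgroup
 * containing 4 CPUs,
 *
 *	LGRP_LOADAVG_MAX_EFFECT(4) == lgrp_loadavg_max_effect / 4
 *
 * i.e. a single thread can contribute at most a quarter of the per-thread
 * maximum to that lgroup's load average.
 */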
 184 
 185 
 186 /*
 187  * Size, in bytes, beyond which random memory allocation policy is applied to
 188  * shared memory.  Default is 8MB (2 ISM pages).
 189  */
 190 size_t  lgrp_shm_random_thresh = 8*1024*1024;
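
/*
 * Both lgrp_privm_random_thresh above and lgrp_shm_random_thresh are plain
 * size_t tunables, so an administrator could, for example, lower the shared
 * memory threshold to 4MB from /etc/system (value chosen only for
 * illustration):
 *
 *	set lgrp_shm_random_thresh = 0x400000
 */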
 191 
 192 /*
 193  * Whether to do processor set aware memory allocation by default
 194  */
 195 int     lgrp_mem_pset_aware = 0;
 196 
 197 /*
 198  * Set the default memory allocation policy for root lgroup
 199  */
 200 lgrp_mem_policy_t       lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
 201 
 202 /*
 203  * Set the default memory allocation policy.  For most platforms,
 204  * next touch is sufficient, but some platforms may wish to override
 205  * this.
 206  */
 207 lgrp_mem_policy_t       lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
 208 
 209 
 210 /*
 211  * lgroup CPU event handlers
 212  */
 213 static void     lgrp_cpu_init(struct cpu *);
 214 static void     lgrp_cpu_fini(struct cpu *, lgrp_id_t);
 215 static lgrp_t   *lgrp_cpu_to_lgrp(struct cpu *);
 216 
 217 /*
 218  * lgroup memory event handlers
 219  */
 220 static void     lgrp_mem_init(int, lgrp_handle_t, boolean_t);
 221 static void     lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
 222 static void     lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
 223 
 224 /*
 225  * lgroup CPU partition event handlers
 226  */
 227 static void     lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
 228 static void     lgrp_part_del_cpu(struct cpu *);
 229 
 230 /*
 231  * lgroup framework initialization
 232  */
 233 static void     lgrp_main_init(void);
 234 static void     lgrp_main_mp_init(void);
 235 static void     lgrp_root_init(void);
 236 static void     lgrp_setup(void);
 237 
 238 /*
 239  * lpl topology
 240  */
 241 static void     lpl_init(lpl_t *, lpl_t *, lgrp_t *);
 242 static void     lpl_clear(lpl_t *);
 243 static void     lpl_leaf_insert(lpl_t *, struct cpupart *);
 244 static void     lpl_leaf_remove(lpl_t *, struct cpupart *);
 245 static void     lpl_rset_add(lpl_t *, lpl_t *);
 246 static void     lpl_rset_del(lpl_t *, lpl_t *);
 247 static int      lpl_rset_contains(lpl_t *, lpl_t *);
 248 static void     lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
 249 static void     lpl_child_update(lpl_t *, struct cpupart *);
 250 static int      lpl_pick(lpl_t *, lpl_t *);
 251 static void     lpl_verify_wrapper(struct cpupart *);
 252 
 253 /*
 254  * defines for lpl topology verifier return codes
 255  */
 256 
 257 #define LPL_TOPO_CORRECT                        0
 258 #define LPL_TOPO_PART_HAS_NO_LPL                -1
 259 #define LPL_TOPO_CPUS_NOT_EMPTY                 -2
 260 #define LPL_TOPO_LGRP_MISMATCH                  -3
 261 #define LPL_TOPO_MISSING_PARENT                 -4
 262 #define LPL_TOPO_PARENT_MISMATCH                -5
 263 #define LPL_TOPO_BAD_CPUCNT                     -6
 264 #define LPL_TOPO_RSET_MISMATCH                  -7
 265 #define LPL_TOPO_LPL_ORPHANED                   -8
 266 #define LPL_TOPO_LPL_BAD_NCPU                   -9
 267 #define LPL_TOPO_RSET_MSSNG_LF                  -10
 268 #define LPL_TOPO_CPU_HAS_BAD_LPL                -11
 269 #define LPL_TOPO_NONLEAF_HAS_CPUS               -12
 270 #define LPL_TOPO_LGRP_NOT_LEAF                  -13
 271 #define LPL_TOPO_BAD_RSETCNT                    -14
 272 
 273 /*
 274  * Return whether lgroup optimizations should be enabled on this system
 275  */
 276 int
 277 lgrp_optimizations(void)
 278 {
 279         /*
 280          * System must have more than 2 lgroups to enable lgroup optimizations
 281          *
 282          * XXX This assumes that a 2 lgroup system has an empty root lgroup
 283          * with one child lgroup containing all the resources. A 2 lgroup
 284          * system with a root lgroup directly containing CPUs or memory might
 285          * need lgroup optimizations with its child lgroup, but there
 286          * isn't such a machine for now....
 287          */
 288         if (nlgrps > 2)
 289                 return (1);
 290 
 291         return (0);
 292 }
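
/*
 * A hypothetical caller would typically gate locality-aware work on the
 * check above, along the lines of:
 *
 *	if (!lgrp_optimizations()) {
 *		(skip lgroup-aware placement and take the simple path)
 *	}
 */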
 293 
 294 /*
 295  * Setup root lgroup
 296  */
 297 static void
 298 lgrp_root_init(void)
 299 {
 300         lgrp_handle_t   hand;
 301         int             i;
 302         lgrp_id_t       id;
 303 
 304         /*
 305          * Create the "root" lgroup
 306          */
 307         ASSERT(nlgrps == 0);
 308         id = nlgrps++;
 309 
 310         lgrp_root = &lroot;
 311 
 312         lgrp_root->lgrp_cpu = NULL;
 313         lgrp_root->lgrp_mnodes = 0;
 314         lgrp_root->lgrp_nmnodes = 0;
 315         hand = lgrp_plat_root_hand();
 316         lgrp_root->lgrp_plathand = hand;
 317 
 318         lgrp_root->lgrp_id = id;
 319         lgrp_root->lgrp_cpucnt = 0;
 320         lgrp_root->lgrp_childcnt = 0;
 321         klgrpset_clear(lgrp_root->lgrp_children);
 322         klgrpset_clear(lgrp_root->lgrp_leaves);
 323         lgrp_root->lgrp_parent = NULL;
 324         lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
 325 
 326         for (i = 0; i < LGRP_RSRC_COUNT; i++)
 327                 klgrpset_clear(lgrp_root->lgrp_set[i]);
 328 
 329         lgrp_root->lgrp_kstat = NULL;
 330 
 331         lgrp_table[id] = lgrp_root;
 332 
 333         /*
 334          * Setup initial lpl list for CPU0 and initial t0 home.
 335          * The only lpl space we have so far is lpl_bootstrap. It is used for
 336          * all topology operations until cp_default is initialized at which
 337          * point t0.t_lpl will be updated.
 338          */
 339         lpl_bootstrap = lpl_bootstrap_list;
 340         t0.t_lpl = lpl_bootstrap;
 341         cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
 342         lpl_bootstrap_list[1].lpl_lgrpid = 1;
 343 
 344         /*
 345          * Set up the bootstrap rset
  346          * Since the bootstrap topology has just the root and a leaf,
  347          * the rset contains just the leaf, and both lpls can use the same rset.
 348          */
 349         lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
 350         lpl_bootstrap_list[0].lpl_rset_sz = 1;
 351         lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
 352         lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
 353 
 354         lpl_bootstrap_list[1].lpl_rset_sz = 1;
 355         lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
 356         lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
 357 
 358         cp_default.cp_lgrploads = lpl_bootstrap;
 359 }
 360 
 361 /*
 362  * Initialize the lgroup framework and allow the platform to do the same
 363  *
 364  * This happens in stages during boot and is all funnelled through this routine
 365  * (see definition of lgrp_init_stages_t to see what happens at each stage and
 366  * when)
 367  */
 368 void
 369 lgrp_init(lgrp_init_stages_t stage)
 370 {
 371         /*
 372          * Initialize the platform
 373          */
 374         lgrp_plat_init(stage);
 375 
 376         switch (stage) {
 377         case LGRP_INIT_STAGE1:
 378                 /*
 379                  * Set max number of lgroups supported on this platform which
 380                  * must be less than the max number of lgroups supported by the
  381                  * common lgroup framework (e.g. NLGRPS_MAX is max elements in
 382                  * lgrp_table[], etc.)
 383                  */
 384                 nlgrpsmax = lgrp_plat_max_lgrps();
 385                 ASSERT(nlgrpsmax <= NLGRPS_MAX);
 386                 break;
 387 
 388         case LGRP_INIT_STAGE2:
 389                 lgrp_setup();
 390                 break;
 391 
 392         case LGRP_INIT_STAGE4:
 393                 lgrp_main_init();
 394                 break;
 395 
 396         case LGRP_INIT_STAGE5:
 397                 lgrp_main_mp_init();
 398                 break;
 399 
 400         default:
 401                 break;
 402         }
 403 }
 404 
 405 /*
 406  * Create the root and cpu0's lgroup, and set t0's home.
 407  */
 408 static void
 409 lgrp_setup(void)
 410 {
 411         /*
 412          * Setup the root lgroup
 413          */
 414         lgrp_root_init();
 415 
 416         /*
 417          * Add cpu0 to an lgroup
 418          */
 419         lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
 420         lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
 421 }
 422 
 423 /*
 424  * true when lgrp initialization has been completed.
 425  */
 426 int     lgrp_initialized = 0;
 427 
 428 /*
 429  * True when lgrp topology is constructed.
 430  */
 431 int     lgrp_topo_initialized = 0;
 432 
 433 /*
 434  * Init routine called after startup(), /etc/system has been processed,
 435  * and cpu0 has been added to an lgroup.
 436  */
 437 static void
 438 lgrp_main_init(void)
 439 {
 440         cpu_t           *cp = CPU;
 441         lgrp_id_t       lgrpid;
 442         int             i;
 443         extern void     pg_cpu0_reinit();
 444 
 445         /*
 446          * Enforce a valid lgrp_mem_default_policy
 447          */
 448         if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
 449             (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
 450             (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
 451                 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
 452 
 453         /*
 454          * See if mpo should be disabled.
 455          * This may happen in the case of null proc LPA on Starcat.
 456          * The platform won't be able to detect null proc LPA until after
 457          * cpu0 and memory have already been added to lgroups.
 458          * When and if it is detected, the Starcat platform will return
 459          * a different platform handle for cpu0 which is what we check for
  460          * here. If mpo should be disabled, move cpu0 to its rightful place
  461          * (the root), and destroy the remaining lgroups. This effectively
  462          * provides a UMA lgroup topology.
 463          */
 464         lgrpid = cp->cpu_lpl->lpl_lgrpid;
 465         if (lgrp_table[lgrpid]->lgrp_plathand !=
 466             lgrp_plat_cpu_to_hand(cp->cpu_id)) {
 467                 lgrp_part_del_cpu(cp);
 468                 lgrp_cpu_fini(cp, lgrpid);
 469 
 470                 lgrp_cpu_init(cp);
 471                 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
 472 
 473                 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
 474 
 475                 /*
 476                  * Notify the PG subsystem that the CPU's lgrp
 477                  * association has changed
 478                  */
 479                 pg_cpu0_reinit();
 480 
 481                 /*
 482                  * Destroy all lgroups except for root
 483                  */
 484                 for (i = 0; i <= lgrp_alloc_max; i++) {
 485                         if (LGRP_EXISTS(lgrp_table[i]) &&
 486                             lgrp_table[i] != lgrp_root)
 487                                 lgrp_destroy(lgrp_table[i]);
 488                 }
 489 
 490                 /*
 491                  * Fix up root to point at itself for leaves and resources
 492                  * and not have any children
 493                  */
 494                 lgrp_root->lgrp_childcnt = 0;
 495                 klgrpset_clear(lgrp_root->lgrp_children);
 496                 klgrpset_clear(lgrp_root->lgrp_leaves);
 497                 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
 498                 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
 499                 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
 500         }
 501 
 502         /*
 503          * Initialize kstats framework.
 504          */
 505         lgrp_kstat_init();
 506         /*
  507          * cpu0 is finally where it should be, so create its lgroup's kstats
 508          */
 509         mutex_enter(&cpu_lock);
 510         lgrp_kstat_create(cp);
 511         mutex_exit(&cpu_lock);
 512 
 513         lgrp_initialized = 1;
 514 }
 515 
 516 /*
 517  * Finish lgrp initialization after all CPUS are brought on-line.
 518  * This routine is called after start_other_cpus().
 519  */
 520 static void
 521 lgrp_main_mp_init(void)
 522 {
 523         klgrpset_t changed;
 524 
 525         ht_init();
 526 
 527         /*
 528          * Update lgroup topology (if necessary)
 529          */
 530         klgrpset_clear(changed);
 531         (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
 532         lgrp_topo_initialized = 1;
 533 }
 534 
 535 /*
 536  * Change latency of lgroup with specified lgroup platform handle (if one is
 537  * given) or change all lgroups with old latency to new latency
 538  */
 539 void
 540 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
 541     u_longlong_t newtime)
 542 {
 543         lgrp_t          *lgrp;
 544         int             i;
 545 
 546         for (i = 0; i <= lgrp_alloc_max; i++) {
 547                 lgrp = lgrp_table[i];
 548 
 549                 if (!LGRP_EXISTS(lgrp))
 550                         continue;
 551 
 552                 if ((hand == LGRP_NULL_HANDLE &&
 553                     lgrp->lgrp_latency == oldtime) ||
 554                     (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
 555                         lgrp->lgrp_latency = (int)newtime;
 556         }
 557 }
 558 
 559 /*
  560  * Handle lgroup (re)configuration events (e.g. addition of a CPU, etc.)
 561  */
 562 void
 563 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
 564 {
 565         klgrpset_t      changed;
 566         cpu_t           *cp;
 567         lgrp_id_t       id;
 568         int             rc;
 569 
 570         switch (event) {
 571         /*
 572          * The following (re)configuration events are common code
 573          * initiated. lgrp_plat_config() is called here to inform the
 574          * platform of the reconfiguration event.
 575          */
 576         case LGRP_CONFIG_CPU_ADD:
 577                 cp = (cpu_t *)resource;
 578 
 579                 /*
 580                  * Initialize the new CPU's lgrp related next/prev
 581                  * links, and give it a bootstrap lpl so that it can
 582                  * survive should it need to enter the dispatcher.
 583                  */
 584                 cp->cpu_next_lpl = cp;
 585                 cp->cpu_prev_lpl = cp;
 586                 cp->cpu_next_lgrp = cp;
 587                 cp->cpu_prev_lgrp = cp;
 588                 cp->cpu_lpl = lpl_bootstrap;
 589 
 590                 lgrp_plat_config(event, resource);
 591                 atomic_inc_32(&lgrp_gen);
 592 
 593                 break;
 594         case LGRP_CONFIG_CPU_DEL:
 595                 lgrp_plat_config(event, resource);
 596                 atomic_inc_32(&lgrp_gen);
 597 
 598                 break;
 599         case LGRP_CONFIG_CPU_ONLINE:
 600                 cp = (cpu_t *)resource;
 601                 lgrp_cpu_init(cp);
 602                 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
 603                 rc = lpl_topo_verify(cp->cpu_part);
 604                 if (rc != LPL_TOPO_CORRECT) {
 605                         panic("lpl_topo_verify failed: %d", rc);
 606                 }
 607                 lgrp_plat_config(event, resource);
 608                 atomic_inc_32(&lgrp_gen);
 609 
 610                 break;
 611         case LGRP_CONFIG_CPU_OFFLINE:
 612                 cp = (cpu_t *)resource;
 613                 id = cp->cpu_lpl->lpl_lgrpid;
 614                 lgrp_part_del_cpu(cp);
 615                 lgrp_cpu_fini(cp, id);
 616                 rc = lpl_topo_verify(cp->cpu_part);
 617                 if (rc != LPL_TOPO_CORRECT) {
 618                         panic("lpl_topo_verify failed: %d", rc);
 619                 }
 620                 lgrp_plat_config(event, resource);
 621                 atomic_inc_32(&lgrp_gen);
 622 
 623                 break;
 624         case LGRP_CONFIG_CPUPART_ADD:
 625                 cp = (cpu_t *)resource;
 626                 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
 627                 rc = lpl_topo_verify(cp->cpu_part);
 628                 if (rc != LPL_TOPO_CORRECT) {
 629                         panic("lpl_topo_verify failed: %d", rc);
 630                 }
 631                 lgrp_plat_config(event, resource);
 632 
 633                 break;
 634         case LGRP_CONFIG_CPUPART_DEL:
 635                 cp = (cpu_t *)resource;
 636                 lgrp_part_del_cpu((cpu_t *)resource);
 637                 rc = lpl_topo_verify(cp->cpu_part);
 638                 if (rc != LPL_TOPO_CORRECT) {
 639                         panic("lpl_topo_verify failed: %d", rc);
 640                 }
 641                 lgrp_plat_config(event, resource);
 642 
 643                 break;
 644         /*
 645          * The following events are initiated by the memnode
 646          * subsystem.
 647          */
 648         case LGRP_CONFIG_MEM_ADD:
 649                 lgrp_mem_init((int)resource, where, B_FALSE);
 650                 atomic_inc_32(&lgrp_gen);
 651 
 652                 break;
 653         case LGRP_CONFIG_MEM_DEL:
 654                 lgrp_mem_fini((int)resource, where, B_FALSE);
 655                 atomic_inc_32(&lgrp_gen);
 656 
 657                 break;
 658         case LGRP_CONFIG_MEM_RENAME: {
 659                 lgrp_config_mem_rename_t *ren_arg =
 660                     (lgrp_config_mem_rename_t *)where;
 661 
 662                 lgrp_mem_rename((int)resource,
 663                     ren_arg->lmem_rename_from,
 664                     ren_arg->lmem_rename_to);
 665                 atomic_inc_32(&lgrp_gen);
 666 
 667                 break;
 668         }
 669         case LGRP_CONFIG_GEN_UPDATE:
 670                 atomic_inc_32(&lgrp_gen);
 671 
 672                 break;
 673         case LGRP_CONFIG_FLATTEN:
 674                 if (where == 0)
 675                         lgrp_topo_levels = (int)resource;
 676                 else
 677                         (void) lgrp_topo_flatten(resource,
 678                             lgrp_table, lgrp_alloc_max, &changed);
 679 
 680                 break;
 681         /*
 682          * Update any lgroups with old latency to new latency
 683          */
 684         case LGRP_CONFIG_LAT_CHANGE_ALL:
 685                 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
 686                     (u_longlong_t)where);
 687 
 688                 break;
 689         /*
 690          * Update lgroup with specified lgroup platform handle to have
 691          * new latency
 692          */
 693         case LGRP_CONFIG_LAT_CHANGE:
 694                 lgrp_latency_change((lgrp_handle_t)resource, 0,
 695                     (u_longlong_t)where);
 696 
 697                 break;
 698         case LGRP_CONFIG_NOP:
 699 
 700                 break;
 701         default:
 702                 break;
 703         }
 704 
 705 }
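
/*
 * For illustration, the memnode subsystem drives the LGRP_CONFIG_MEM_ADD
 * case above with the memory node id as "resource" and the lgroup platform
 * handle as "where"; a hypothetical caller would look roughly like:
 *
 *	lgrp_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)mnode,
 *	    (uintptr_t)hand);
 *
 * where "mnode" and "hand" are whatever memory node and platform handle the
 * platform layer has just configured.
 */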
 706 
 707 /*
 708  * Called to add lgrp info into cpu structure from cpu_add_unit;
 709  * do not assume cpu is in cpu[] yet!
 710  *
 711  * CPUs are brought online with all other CPUs paused so we can't
 712  * allocate memory or we could deadlock the system, so we rely on
 713  * the platform to statically allocate as much space as we need
 714  * for the lgrp structs and stats.
 715  */
 716 static void
 717 lgrp_cpu_init(struct cpu *cp)
 718 {
 719         klgrpset_t      changed;
 720         int             count;
 721         lgrp_handle_t   hand;
 722         int             first_cpu;
 723         lgrp_t          *my_lgrp;
 724         lgrp_id_t       lgrpid;
 725         struct cpu      *cptr;
 726 
 727         /*
 728          * This is the first time through if the resource set
 729          * for the root lgroup is empty. After cpu0 has been
 730          * initially added to an lgroup, the root's CPU resource
 731          * set can never be empty, since the system's last CPU
 732          * cannot be offlined.
 733          */
 734         if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
 735                 /*
 736                  * First time through.
 737                  */
 738                 first_cpu = 1;
 739         } else {
 740                 /*
 741                  * If cpu0 needs to move lgroups, we may come
 742                  * through here again, at which time cpu_lock won't
 743                  * be held, and lgrp_initialized will be false.
 744                  */
 745                 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
 746                 ASSERT(cp->cpu_part != NULL);
 747                 first_cpu = 0;
 748         }
 749 
 750         hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
 751         my_lgrp = lgrp_hand_to_lgrp(hand);
 752 
 753         if (my_lgrp == NULL) {
 754                 /*
 755                  * Create new lgrp and add it to lgroup topology
 756                  */
 757                 my_lgrp = lgrp_create();
 758                 my_lgrp->lgrp_plathand = hand;
 759                 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
 760                 lgrpid = my_lgrp->lgrp_id;
 761                 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
 762                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 763 
 764                 count = 0;
 765                 klgrpset_clear(changed);
 766                 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
 767                     &changed);
 768                 /*
 769                  * May have added new intermediate lgroups, so need to add
 770                  * resources other than CPUs which are added below
 771                  */
 772                 (void) lgrp_mnode_update(changed, NULL);
 773         } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
 774             > 0) {
 775                 /*
 776                  * Leaf lgroup was created, but latency wasn't available
 777                  * then.  So, set latency for it and fill in rest of lgroup
  778                  * then.  So, set latency for it and fill in the rest of the
  779                  * lgroup topology now that we know how far it is from other leaf
 780                  */
 781                 lgrpid = my_lgrp->lgrp_id;
 782                 klgrpset_clear(changed);
 783                 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
 784                     lgrpid))
 785                         klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 786                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
 787                     &changed);
 788 
 789                 /*
 790                  * May have added new intermediate lgroups, so need to add
 791                  * resources other than CPUs which are added below
 792                  */
 793                 (void) lgrp_mnode_update(changed, NULL);
 794         } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
 795             my_lgrp->lgrp_id)) {
 796                 int     i;
 797 
 798                 /*
 799                  * Update existing lgroup and lgroups containing it with CPU
 800                  * resource
 801                  */
 802                 lgrpid = my_lgrp->lgrp_id;
 803                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 804                 for (i = 0; i <= lgrp_alloc_max; i++) {
 805                         lgrp_t          *lgrp;
 806 
 807                         lgrp = lgrp_table[i];
 808                         if (!LGRP_EXISTS(lgrp) ||
 809                             !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
 810                                 continue;
 811 
 812                         klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 813                 }
 814         }
 815 
 816         lgrpid = my_lgrp->lgrp_id;
 817         cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
 818 
 819         /*
  820          * For multi-lgroup systems, we need to set up the lpl for CPU0 or CPU0
  821          * will end up in the lpl for lgroup 0 whether it belongs there or not,
  822          * since none of the lgroup IDs in the lpls have been set yet.
 823          */
 824         if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
 825                 cp->cpu_lpl->lpl_lgrpid = lgrpid;
 826 
 827         /*
 828          * link the CPU into the lgrp's CPU list
 829          */
 830         if (my_lgrp->lgrp_cpucnt == 0) {
 831                 my_lgrp->lgrp_cpu = cp;
 832                 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
 833         } else {
 834                 cptr = my_lgrp->lgrp_cpu;
 835                 cp->cpu_next_lgrp = cptr;
 836                 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
 837                 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
 838                 cptr->cpu_prev_lgrp = cp;
 839         }
 840         my_lgrp->lgrp_cpucnt++;
 841 }
 842 
 843 lgrp_t *
 844 lgrp_create(void)
 845 {
 846         lgrp_t          *my_lgrp;
 847         lgrp_id_t       lgrpid;
 848         int             i;
 849 
 850         ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
 851 
 852         /*
  853          * Find an open slot in the lgroup table and recycle the unused lgroup
  854          * left there, if any
 855          */
 856         my_lgrp = NULL;
 857         if (lgrp_alloc_hint == -1)
 858                 /*
  859                  * Allocate from the end when the hint is not set yet because no
  860                  * lgroups have been deleted yet
 861                  */
 862                 lgrpid = nlgrps++;
 863         else {
 864                 /*
 865                  * Start looking for next open slot from hint and leave hint
 866                  * at slot allocated
 867                  */
 868                 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
 869                         my_lgrp = lgrp_table[i];
 870                         if (!LGRP_EXISTS(my_lgrp)) {
 871                                 lgrpid = i;
 872                                 nlgrps++;
 873                                 break;
 874                         }
 875                 }
 876                 lgrp_alloc_hint = lgrpid;
 877         }
 878 
 879         /*
 880          * Keep track of max lgroup ID allocated so far to cut down on searches
 881          */
 882         if (lgrpid > lgrp_alloc_max)
 883                 lgrp_alloc_max = lgrpid;
 884 
 885         /*
 886          * Need to allocate new lgroup if next open slot didn't have one
 887          * for recycling
 888          */
 889         if (my_lgrp == NULL)
 890                 my_lgrp = lgrp_plat_alloc(lgrpid);
 891 
 892         if (nlgrps > nlgrpsmax || my_lgrp == NULL)
 893                 panic("Too many lgrps for platform (%d)", nlgrps);
 894 
 895         my_lgrp->lgrp_id = lgrpid;
 896         my_lgrp->lgrp_latency = 0;
 897         my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
 898         my_lgrp->lgrp_parent = NULL;
 899         my_lgrp->lgrp_childcnt = 0;
 900         my_lgrp->lgrp_mnodes = (mnodeset_t)0;
 901         my_lgrp->lgrp_nmnodes = 0;
 902         klgrpset_clear(my_lgrp->lgrp_children);
 903         klgrpset_clear(my_lgrp->lgrp_leaves);
 904         for (i = 0; i < LGRP_RSRC_COUNT; i++)
 905                 klgrpset_clear(my_lgrp->lgrp_set[i]);
 906 
 907         my_lgrp->lgrp_cpu = NULL;
 908         my_lgrp->lgrp_cpucnt = 0;
 909 
 910         if (my_lgrp->lgrp_kstat != NULL)
 911                 lgrp_kstat_reset(lgrpid);
 912 
 913         lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
 914 
 915         return (my_lgrp);
 916 }
 917 
 918 void
 919 lgrp_destroy(lgrp_t *lgrp)
 920 {
 921         int             i;
 922 
 923         /*
 924          * Unless this lgroup is being destroyed on behalf of
 925          * the boot CPU, cpu_lock must be held
 926          */
 927         ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
 928 
 929         if (nlgrps == 1)
 930                 cmn_err(CE_PANIC, "Can't destroy only lgroup!");
 931 
 932         if (!LGRP_EXISTS(lgrp))
 933                 return;
 934 
 935         /*
 936          * Set hint to lgroup being deleted and try to keep lower numbered
 937          * hints to facilitate finding empty slots
 938          */
 939         if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
 940                 lgrp_alloc_hint = lgrp->lgrp_id;
 941 
 942         /*
 943          * Mark this lgroup to be recycled by setting its lgroup ID to
 944          * LGRP_NONE and clear relevant fields
 945          */
 946         lgrp->lgrp_id = LGRP_NONE;
 947         lgrp->lgrp_latency = 0;
 948         lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
 949         lgrp->lgrp_parent = NULL;
 950         lgrp->lgrp_childcnt = 0;
 951 
 952         klgrpset_clear(lgrp->lgrp_children);
 953         klgrpset_clear(lgrp->lgrp_leaves);
 954         for (i = 0; i < LGRP_RSRC_COUNT; i++)
 955                 klgrpset_clear(lgrp->lgrp_set[i]);
 956 
 957         lgrp->lgrp_mnodes = (mnodeset_t)0;
 958         lgrp->lgrp_nmnodes = 0;
 959 
 960         lgrp->lgrp_cpu = NULL;
 961         lgrp->lgrp_cpucnt = 0;
 962 
 963         nlgrps--;
 964 }
 965 
 966 /*
  967  * Initialize kstat data. Called from lgrp initialization code.
 968  */
 969 static void
 970 lgrp_kstat_init(void)
 971 {
 972         lgrp_stat_t     stat;
 973 
 974         mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
 975 
 976         for (stat = 0; stat < LGRP_NUM_STATS; stat++)
 977                 kstat_named_init(&lgrp_kstat_data[stat],
 978                     lgrp_kstat_names[stat], KSTAT_DATA_INT64);
 979 }
 980 
 981 /*
 982  * initialize an lgrp's kstats if needed
 983  * called with cpu_lock held but not with cpus paused.
 984  * we don't tear these down now because we don't know about
 985  * memory leaving the lgrp yet...
 986  */
 987 
 988 void
 989 lgrp_kstat_create(cpu_t *cp)
 990 {
 991         kstat_t         *lgrp_kstat;
 992         lgrp_id_t       lgrpid;
 993         lgrp_t          *my_lgrp;
 994 
 995         ASSERT(MUTEX_HELD(&cpu_lock));
 996 
 997         lgrpid = cp->cpu_lpl->lpl_lgrpid;
 998         my_lgrp = lgrp_table[lgrpid];
 999 
1000         if (my_lgrp->lgrp_kstat != NULL)
1001                 return; /* already initialized */
1002 
1003         lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1004             KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1005             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1006 
1007         if (lgrp_kstat != NULL) {
1008                 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1009                 lgrp_kstat->ks_private = my_lgrp;
1010                 lgrp_kstat->ks_data = &lgrp_kstat_data;
1011                 lgrp_kstat->ks_update = lgrp_kstat_extract;
1012                 my_lgrp->lgrp_kstat = lgrp_kstat;
1013                 kstat_install(lgrp_kstat);
1014         }
1015 }
1016 
1017 /*
1018  * this will do something when we manage to remove now unused lgrps
1019  */
1020 
1021 /* ARGSUSED */
1022 void
1023 lgrp_kstat_destroy(cpu_t *cp)
1024 {
1025         ASSERT(MUTEX_HELD(&cpu_lock));
1026 }
1027 
1028 /*
1029  * Called when a CPU is off-lined.
1030  */
1031 static void
1032 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1033 {
1034         lgrp_t *my_lgrp;
1035         struct cpu *prev;
1036         struct cpu *next;
1037 
1038         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1039 
1040         prev = cp->cpu_prev_lgrp;
1041         next = cp->cpu_next_lgrp;
1042 
1043         prev->cpu_next_lgrp = next;
1044         next->cpu_prev_lgrp = prev;
1045 
1046         /*
1047          * just because I'm paranoid doesn't mean...
1048          */
1049 
1050         cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1051 
1052         my_lgrp = lgrp_table[lgrpid];
1053         my_lgrp->lgrp_cpucnt--;
1054 
1055         /*
1056          * Removing last CPU in lgroup, so update lgroup topology
1057          */
1058         if (my_lgrp->lgrp_cpucnt == 0) {
1059                 klgrpset_t      changed;
1060                 int             count;
1061                 int             i;
1062 
1063                 my_lgrp->lgrp_cpu = NULL;
1064 
1065                 /*
1066                  * Remove this lgroup from its lgroup CPU resources and remove
1067                  * lgroup from lgroup topology if it doesn't have any more
1068                  * resources in it now
1069                  */
1070                 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1071                 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1072                         count = 0;
1073                         klgrpset_clear(changed);
1074                         count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1075                             lgrp_alloc_max + 1, &changed);
1076                         return;
1077                 }
1078 
1079                 /*
1080                  * This lgroup isn't empty, so just remove it from CPU
1081                  * resources of any lgroups that contain it as such
1082                  */
1083                 for (i = 0; i <= lgrp_alloc_max; i++) {
1084                         lgrp_t          *lgrp;
1085 
1086                         lgrp = lgrp_table[i];
1087                         if (!LGRP_EXISTS(lgrp) ||
1088                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1089                             lgrpid))
1090                                 continue;
1091 
1092                         klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1093                 }
1094                 return;
1095         }
1096 
1097         if (my_lgrp->lgrp_cpu == cp)
1098                 my_lgrp->lgrp_cpu = next;
1099 
1100 }
1101 
1102 /*
1103  * Update memory nodes in target lgroups and return ones that get changed
1104  */
1105 int
1106 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1107 {
1108         int     count;
1109         int     i;
1110         int     j;
1111         lgrp_t  *lgrp;
1112         lgrp_t  *lgrp_rsrc;
1113 
1114         count = 0;
1115         if (changed)
1116                 klgrpset_clear(*changed);
1117 
1118         if (klgrpset_isempty(target))
1119                 return (0);
1120 
1121         /*
1122          * Find each lgroup in target lgroups
1123          */
1124         for (i = 0; i <= lgrp_alloc_max; i++) {
1125                 /*
1126                  * Skip any lgroups that don't exist or aren't in target group
1127                  */
1128                 lgrp = lgrp_table[i];
1129                 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1130                         continue;
1131                 }
1132 
1133                 /*
1134                  * Initialize memnodes for intermediate lgroups to 0
1135                  * and update them from scratch since they may have completely
1136                  * changed
1137                  */
1138                 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1139                         lgrp->lgrp_mnodes = (mnodeset_t)0;
1140                         lgrp->lgrp_nmnodes = 0;
1141                 }
1142 
1143                 /*
 1144                  * Update memory nodes of the target lgroup with memory nodes
1145                  * from each lgroup in its lgroup memory resource set
1146                  */
1147                 for (j = 0; j <= lgrp_alloc_max; j++) {
1148                         int     k;
1149 
1150                         /*
1151                          * Skip any lgroups that don't exist or aren't in
1152                          * memory resources of target lgroup
1153                          */
1154                         lgrp_rsrc = lgrp_table[j];
1155                         if (!LGRP_EXISTS(lgrp_rsrc) ||
1156                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1157                             j))
1158                                 continue;
1159 
1160                         /*
1161                          * Update target lgroup's memnodes to include memnodes
1162                          * of this lgroup
1163                          */
1164                         for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1165                                 mnodeset_t      mnode_mask;
1166 
1167                                 mnode_mask = (mnodeset_t)1 << k;
1168                                 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1169                                     !(lgrp->lgrp_mnodes & mnode_mask)) {
1170                                         lgrp->lgrp_mnodes |= mnode_mask;
1171                                         lgrp->lgrp_nmnodes++;
1172                                 }
1173                         }
1174                         count++;
1175                         if (changed)
1176                                 klgrpset_add(*changed, lgrp->lgrp_id);
1177                 }
1178         }
1179 
1180         return (count);
1181 }
1182 
1183 /*
1184  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1185  * is moved from one board to another. The "from" and "to" arguments specify the
1186  * source and the destination of the move.
1187  *
1188  * See plat_lgrp_config() for a detailed description of the copy-rename
1189  * semantics.
1190  *
1191  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1192  * the lgroup topology which is changing as memory moves from one lgroup to
1193  * another. It removes the mnode from the source lgroup and re-inserts it in the
1194  * target lgroup.
1195  *
1196  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 1197  * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
1198  * copy-rename operation.
1199  *
1200  * There is one case which requires special handling. If the system contains
1201  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1202  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 1203  * lgrp_mem_init(), but there is a window when the system has no memory in the
1204  * lgroup hierarchy. If another thread tries to allocate memory during this
1205  * window, the allocation will fail, although the system has physical memory.
1206  * This may cause a system panic or a deadlock (some sleeping memory allocations
1207  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1208  * the mnode back).
1209  *
1210  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1211  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1212  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1213  * but it updates the rest of the lgroup topology as if the mnode was actually
1214  * removed. The lgrp_mem_init() function recognizes that the mnode being
1215  * inserted represents such a special case and updates the topology
1216  * appropriately.
1217  */
1218 void
1219 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1220 {
1221         /*
1222          * Remove the memory from the source node and add it to the destination
1223          * node.
1224          */
1225         lgrp_mem_fini(mnode, from, B_TRUE);
1226         lgrp_mem_init(mnode, to, B_TRUE);
1227 }
1228 
1229 /*
1230  * Called to indicate that the lgrp with platform handle "hand" now
1231  * contains the memory identified by "mnode".
1232  *
1233  * LOCKING for this routine is a bit tricky. Usually it is called without
 1234  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1235  * callers. During DR of the board containing the caged memory it may be called
1236  * with cpu_lock already held and CPUs paused.
1237  *
1238  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1239  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1240  * dealing with the special case of DR copy-rename described in
1241  * lgrp_mem_rename().
1242  */
1243 void
1244 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1245 {
1246         klgrpset_t      changed;
1247         int             count;
1248         int             i;
1249         lgrp_t          *my_lgrp;
1250         lgrp_id_t       lgrpid;
1251         mnodeset_t      mnodes_mask = ((mnodeset_t)1 << mnode);
1252         boolean_t       drop_lock = B_FALSE;
1253         boolean_t       need_synch = B_FALSE;
1254 
1255         /*
1256          * Grab CPU lock (if we haven't already)
1257          */
1258         if (!MUTEX_HELD(&cpu_lock)) {
1259                 mutex_enter(&cpu_lock);
1260                 drop_lock = B_TRUE;
1261         }
1262 
1263         /*
1264          * This routine may be called from a context where we already
1265          * hold cpu_lock, and have already paused cpus.
1266          */
1267         if (!cpus_paused())
1268                 need_synch = B_TRUE;
1269 
1270         /*
1271          * Check if this mnode is already configured and return immediately if
1272          * it is.
1273          *
1274          * NOTE: in special case of copy-rename of the only remaining mnode,
1275          * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1276          * recognize this case and continue as usual, but skip the update to
 1277          * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
 1278          * in topology temporarily introduced by lgrp_mem_fini().
1279          */
1280         if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1281             lgrp_root->lgrp_mnodes & mnodes_mask) {
1282                 if (drop_lock)
1283                         mutex_exit(&cpu_lock);
1284                 return;
1285         }
1286 
1287         /*
1288          * Update lgroup topology with new memory resources, keeping track of
1289          * which lgroups change
1290          */
1291         count = 0;
1292         klgrpset_clear(changed);
1293         my_lgrp = lgrp_hand_to_lgrp(hand);
1294         if (my_lgrp == NULL) {
1295                 /* new lgrp */
1296                 my_lgrp = lgrp_create();
1297                 lgrpid = my_lgrp->lgrp_id;
1298                 my_lgrp->lgrp_plathand = hand;
1299                 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1300                 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1301                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1302 
1303                 if (need_synch)
1304                         pause_cpus(NULL, NULL);
1305                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1306                     &changed);
1307                 if (need_synch)
1308                         start_cpus();
1309         } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1310             > 0) {
1311                 /*
1312                  * Leaf lgroup was created, but latency wasn't available
 1313                  * then.  So, set latency for it and fill in the rest of the
 1314                  * lgroup topology now that we know how far it is from other leaf
1315                  * lgroups.
1316                  */
1317                 klgrpset_clear(changed);
1318                 lgrpid = my_lgrp->lgrp_id;
1319                 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1320                     lgrpid))
1321                         klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1322                 if (need_synch)
1323                         pause_cpus(NULL, NULL);
1324                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1325                     &changed);
1326                 if (need_synch)
1327                         start_cpus();
1328         } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1329             my_lgrp->lgrp_id)) {
1330                 /*
1331                  * Add new lgroup memory resource to existing lgroup
1332                  */
1333                 lgrpid = my_lgrp->lgrp_id;
1334                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1335                 klgrpset_add(changed, lgrpid);
1336                 count++;
1337                 for (i = 0; i <= lgrp_alloc_max; i++) {
1338                         lgrp_t          *lgrp;
1339 
1340                         lgrp = lgrp_table[i];
1341                         if (!LGRP_EXISTS(lgrp) ||
1342                             !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1343                                 continue;
1344 
1345                         klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1346                         klgrpset_add(changed, lgrp->lgrp_id);
1347                         count++;
1348                 }
1349         }
1350 
1351         /*
1352          * Add memory node to lgroup and remove lgroup from ones that need
1353          * to be updated
1354          */
1355         if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1356                 my_lgrp->lgrp_mnodes |= mnodes_mask;
1357                 my_lgrp->lgrp_nmnodes++;
1358         }
1359         klgrpset_del(changed, lgrpid);
1360 
1361         /*
1362          * Update memory node information for all lgroups that changed and
1363          * contain new memory node as a resource
1364          */
1365         if (count)
1366                 (void) lgrp_mnode_update(changed, NULL);
1367 
1368         if (drop_lock)
1369                 mutex_exit(&cpu_lock);
1370 }
1371 
1372 /*
1373  * Called to indicate that the lgroup associated with the platform
1374  * handle "hand" no longer contains given memory node
1375  *
1376  * LOCKING for this routine is a bit tricky. Usually it is called without
 1377  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1378  * callers. During DR of the board containing the caged memory it may be called
1379  * with cpu_lock already held and CPUs paused.
1380  *
1381  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1382  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1383  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1384  * the same mnode back into the topology. See lgrp_mem_rename() and
1385  * lgrp_mem_init() for additional details.
1386  */
1387 void
1388 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1389 {
1390         klgrpset_t      changed;
1391         int             count;
1392         int             i;
1393         lgrp_t          *my_lgrp;
1394         lgrp_id_t       lgrpid;
1395         mnodeset_t      mnodes_mask;
1396         boolean_t       drop_lock = B_FALSE;
1397         boolean_t       need_synch = B_FALSE;
1398 
1399         /*
1400          * Grab CPU lock (if we haven't already)
1401          */
1402         if (!MUTEX_HELD(&cpu_lock)) {
1403                 mutex_enter(&cpu_lock);
1404                 drop_lock = B_TRUE;
1405         }
1406 
1407         /*
1408          * This routine may be called from a context where we already
1409          * hold cpu_lock and have already paused cpus.
1410          */
1411         if (!cpus_paused())
1412                 need_synch = B_TRUE;
1413 
1414         my_lgrp = lgrp_hand_to_lgrp(hand);
1415 
1416         /*
1417          * The lgrp *must* be pre-existing
1418          */
1419         ASSERT(my_lgrp != NULL);
1420 
1421         /*
1422          * Delete memory node from lgroups which contain it
1423          */
1424         mnodes_mask = ((mnodeset_t)1 << mnode);
1425         for (i = 0; i <= lgrp_alloc_max; i++) {
1426                 lgrp_t *lgrp = lgrp_table[i];
1427                 /*
1428                  * Skip any non-existent lgroups and any lgroups that don't
 1429                  * contain the memory node being deleted
1430                  */
1431                 if (!LGRP_EXISTS(lgrp) ||
1432                     !(lgrp->lgrp_mnodes & mnodes_mask))
1433                         continue;
1434 
1435                 /*
1436                  * Avoid removing the last mnode from the root in the DR
1437                  * copy-rename case. See lgrp_mem_rename() for details.
1438                  */
1439                 if (is_copy_rename &&
1440                     (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1441                         continue;
1442 
1443                 /*
1444                  * Remove memory node from lgroup.
1445                  */
1446                 lgrp->lgrp_mnodes &= ~mnodes_mask;
1447                 lgrp->lgrp_nmnodes--;
1448                 ASSERT(lgrp->lgrp_nmnodes >= 0);
1449         }
1450         ASSERT(lgrp_root->lgrp_nmnodes > 0);
1451 
1452         /*
1453          * Don't need to update lgroup topology if this lgroup still has memory.
1454          *
1455          * In the special case of DR copy-rename with the only mnode being
1456          * removed, the lgrp_mnodes for the root is always non-zero, but we
1457          * still need to update the lgroup topology.
1458          */
1459         if ((my_lgrp->lgrp_nmnodes > 0) &&
1460             !(is_copy_rename && (my_lgrp == lgrp_root) &&
1461             (my_lgrp->lgrp_mnodes == mnodes_mask))) {
1462                 if (drop_lock)
1463                         mutex_exit(&cpu_lock);
1464                 return;
1465         }
1466 
1467         /*
1468          * This lgroup does not contain any memory now
1469          */
1470         klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1471 
1472         /*
1473          * Remove this lgroup from lgroup topology if it does not contain any
1474          * resources now
1475          */
1476         lgrpid = my_lgrp->lgrp_id;
1477         count = 0;
1478         klgrpset_clear(changed);
1479         if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1480                 /*
1481                  * Delete lgroup when no more resources
1482                  */
1483                 if (need_synch)
1484                         pause_cpus(NULL, NULL);
1485                 count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1486                     lgrp_alloc_max + 1, &changed);
1487                 ASSERT(count > 0);
1488                 if (need_synch)
1489                         start_cpus();
1490         } else {
1491                 /*
1492                  * Remove lgroup from memory resources of any lgroups that
1493                  * contain it as such
1494                  */
1495                 for (i = 0; i <= lgrp_alloc_max; i++) {
1496                         lgrp_t          *lgrp;
1497 
1498                         lgrp = lgrp_table[i];
1499                         if (!LGRP_EXISTS(lgrp) ||
1500                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1501                             lgrpid))
1502                                 continue;
1503 
1504                         klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1505                 }
1506         }
1507         if (drop_lock)
1508                 mutex_exit(&cpu_lock);
1509 }
1510 
1511 /*
1512  * Return lgroup with given platform handle
1513  */
1514 lgrp_t *
1515 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1516 {
1517         int     i;
1518         lgrp_t  *lgrp;
1519 
1520         if (hand == LGRP_NULL_HANDLE)
1521                 return (NULL);
1522 
1523         for (i = 0; i <= lgrp_alloc_max; i++) {
1524                 lgrp = lgrp_table[i];
1525                 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1526                         return (lgrp);
1527         }
1528         return (NULL);
1529 }
1530 
1531 /*
1532  * Return the home lgroup of the current thread.
1533  * We must do this with kernel preemption disabled, since we don't want our
1534  * thread to be re-homed while we're poking around with its lpl, and the lpl
1535  * should never be NULL.
1536  *
1537  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1538  * is enabled because of DR.  Callers can disable kernel preemption
1539  * around this call to guarantee that the lgroup will be valid beyond this
1540  * routine, since kernel preemption can be recursive.
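 *
 * For example, a caller that needs the lgrp to stay valid after this call
 * could do (illustrative sketch only):
 *
 *	kpreempt_disable();
 *	lgrp = lgrp_home_lgrp();
 *	... use lgrp, e.g. read lgrp->lgrp_id ...
 *	kpreempt_enable();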
1541  */
1542 lgrp_t *
1543 lgrp_home_lgrp(void)
1544 {
1545         lgrp_t  *lgrp;
1546         lpl_t   *lpl;
1547 
1548         kpreempt_disable();
1549 
1550         lpl = curthread->t_lpl;
1551         ASSERT(lpl != NULL);
1552         ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1553         ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1554         lgrp = lgrp_table[lpl->lpl_lgrpid];
1555 
1556         kpreempt_enable();
1557 
1558         return (lgrp);
1559 }
1560 
1561 /*
1562  * Return ID of home lgroup for given thread
1563  * (See comments for lgrp_home_lgrp() for special care and handling
1564  * instructions)
1565  */
1566 lgrp_id_t
1567 lgrp_home_id(kthread_t *t)
1568 {
1569         lgrp_id_t       lgrp;
1570         lpl_t           *lpl;
1571 
1572         ASSERT(t != NULL);
1573         /*
1574          * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1575          * cannot since the HAT layer can call into this routine to
1576          * determine the locality for its data structures in the context
1577          * of a page fault.
1578          */
1579 
1580         kpreempt_disable();
1581 
1582         lpl = t->t_lpl;
1583         ASSERT(lpl != NULL);
1584         ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1585         lgrp = lpl->lpl_lgrpid;
1586 
1587         kpreempt_enable();
1588 
1589         return (lgrp);
1590 }
1591 
1592 /*
1593  * Return lgroup containing the physical memory for the given page frame number
1594  */
1595 lgrp_t *
1596 lgrp_pfn_to_lgrp(pfn_t pfn)
1597 {
1598         lgrp_handle_t   hand;
1599         int             i;
1600         lgrp_t          *lgrp;
1601 
1602         hand = lgrp_plat_pfn_to_hand(pfn);
1603         if (hand != LGRP_NULL_HANDLE)
1604                 for (i = 0; i <= lgrp_alloc_max; i++) {
1605                         lgrp = lgrp_table[i];
1606                         if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1607                                 return (lgrp);
1608                 }
1609         return (NULL);
1610 }
1611 
1612 /*
1613  * Return lgroup containing the physical memory for the given page frame number
1614  */
1615 lgrp_t *
1616 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1617 {
1618         lgrp_handle_t   hand;
1619         int             i;
1620         lgrp_t          *lgrp;
1621         pfn_t           pfn;
1622 
1623         pfn = btop(physaddr);
1624         hand = lgrp_plat_pfn_to_hand(pfn);
1625         if (hand != LGRP_NULL_HANDLE)
1626                 for (i = 0; i <= lgrp_alloc_max; i++) {
1627                         lgrp = lgrp_table[i];
1628                         if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1629                                 return (lgrp);
1630                 }
1631         return (NULL);
1632 }
1633 
1634 /*
1635  * Return the leaf lgroup containing the given CPU
1636  *
1637  * The caller needs to take precautions necessary to prevent
1638  * "cpu" and its lpl from going away across a call to this function.
1639  * hint: kpreempt_disable()/kpreempt_enable()
1640  */
1641 static lgrp_t *
1642 lgrp_cpu_to_lgrp(cpu_t *cpu)
1643 {
1644         return (cpu->cpu_lpl->lpl_lgrp);
1645 }
1646 
1647 /*
1648  * Return the sum of the partition loads in an lgrp divided by
1649  * the number of CPUs in the lgrp.  This is our best approximation
1650  * of an 'lgroup load average' for a useful per-lgroup kstat.
1651  */
1652 static uint64_t
1653 lgrp_sum_loadavgs(lgrp_t *lgrp)
1654 {
1655         cpu_t *cpu;
1656         int ncpu;
1657         uint64_t loads = 0;
1658 
1659         mutex_enter(&cpu_lock);
1660 
1661         cpu = lgrp->lgrp_cpu;
1662         ncpu = lgrp->lgrp_cpucnt;
1663 
1664         if (cpu == NULL || ncpu == 0) {
1665                 mutex_exit(&cpu_lock);
1666                 return (0ull);
1667         }
1668 
1669         do {
1670                 loads += cpu->cpu_lpl->lpl_loadavg;
1671                 cpu = cpu->cpu_next_lgrp;
1672         } while (cpu != lgrp->lgrp_cpu);
1673 
1674         mutex_exit(&cpu_lock);
1675 
1676         return (loads / ncpu);
1677 }
1678 
1679 void
1680 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1681 {
1682         struct lgrp_stats *pstats;
1683 
1684         /*
1685          * Verify that the caller isn't trying to add to
1686          * a statistic for an lgroup that has gone away
1687          */
1688         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1689                 return;
1690 
1691         pstats = &lgrp_stats[lgrpid];
1692         atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1693 }
1694 
1695 int64_t
1696 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1697 {
1698         uint64_t val;
1699         struct lgrp_stats *pstats;
1700 
1701         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1702                 return ((int64_t)0);
1703 
1704         pstats = &lgrp_stats[lgrpid];
1705         LGRP_STAT_READ(pstats, stat, val);
1706         return (val);
1707 }
1708 
1709 /*
1710  * Reset all kstats for lgrp specified by its lgrpid.
1711  */
1712 static void
1713 lgrp_kstat_reset(lgrp_id_t lgrpid)
1714 {
1715         lgrp_stat_t stat;
1716 
1717         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1718                 return;
1719 
1720         for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1721                 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1722         }
1723 }
1724 
1725 /*
1726  * Collect all per-lgrp statistics for the lgrp associated with this
1727  * kstat, and store them in the ks_data array.
1728  *
1729  * The superuser can reset all the running counter statistics for an
1730  * lgrp by writing to any of the lgrp's stats.
1731  */
1732 static int
1733 lgrp_kstat_extract(kstat_t *ksp, int rw)
1734 {
1735         lgrp_stat_t             stat;
1736         struct kstat_named      *ksd;
1737         lgrp_t                  *lgrp;
1738         lgrp_id_t               lgrpid;
1739 
1740         lgrp = (lgrp_t *)ksp->ks_private;
1741 
1742         ksd = (struct kstat_named *)ksp->ks_data;
1743         ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1744 
1745         lgrpid = lgrp->lgrp_id;
1746 
1747         if (lgrpid == LGRP_NONE) {
1748                 /*
1749                  * Return all zeroes as stats for freed lgrp.
1750                  */
1751                 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1752                         ksd[stat].value.i64 = 0;
1753                 }
1754                 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1755                 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1756                 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1757                 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1758                 ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1759         } else if (rw != KSTAT_WRITE) {
1760                 /*
1761                  * Handle counter stats
1762                  */
1763                 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1764                         ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1765                 }
1766 
1767                 /*
1768                  * Handle kernel data snapshot stats
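                 * (after the loop above, "stat" equals LGRP_NUM_COUNTER_STATS,
                 * so these entries land in ks_data just past the counters)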
1769                  */
1770                 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1771                 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1772                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1773                 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1774                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1775                 ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1776                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1777                 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1778                 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1779                     lgrp_loadavg_max_effect;
1780         } else {
1781                 lgrp_kstat_reset(lgrpid);
1782         }
1783 
1784         return (0);
1785 }
1786 
1787 int
1788 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1789 {
1790         cpu_t   *cp;
1791 
1792         mutex_enter(&cpu_lock);
1793 
1794         if ((cp = cpu_get(id)) == NULL) {
1795                 mutex_exit(&cpu_lock);
1796                 return (EINVAL);
1797         }
1798 
1799         if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1800                 mutex_exit(&cpu_lock);
1801                 return (EINVAL);
1802         }
1803 
1804         ASSERT(cp->cpu_lpl != NULL);
1805 
1806         *lp = cp->cpu_lpl->lpl_lgrpid;
1807 
1808         mutex_exit(&cpu_lock);
1809 
1810         return (0);
1811 }
1812 
1813 int
1814 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1815 {
1816         cpu_t *cp;
1817 
1818         mutex_enter(&cpu_lock);
1819 
1820         if ((cp = cpu_get(id)) == NULL) {
1821                 mutex_exit(&cpu_lock);
1822                 return (EINVAL);
1823         }
1824 
1825         ASSERT(cp->cpu_lpl != NULL);
1826 
1827         *lp = cp->cpu_lpl->lpl_loadavg;
1828 
1829         mutex_exit(&cpu_lock);
1830 
1831         return (0);
1832 }
1833 
1834 /*
1835  * Add a resource named by lpl_leaf to rset of lpl_target
1836  *
1837  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1838  * resource. It is adjusted here, as this is presently the only place that we
1839  * can be certain a resource addition has succeeded.
1840  *
1841  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1842  * list in order until it reaches a NULL.  (This list is required to be NULL
1843  * terminated, too).  This is done so that we can mark start pos + 1, so that
1844  * each lpl is traversed sequentially, but in a different order.  We hope this
1845  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
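 *
 * For example (illustrative only), inserting a leaf with lgrpid 3 into an
 * rset currently ordered { lpl1, lpl4, lpl7, NULL } shifts lpl4 and lpl7 one
 * slot to the right, yielding { lpl1, lpl3, lpl4, lpl7, NULL }, and updates
 * lpl_id2rset so that, e.g., lpl_id2rset[4] == 2 and lpl_id2rset[7] == 3.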
1846  */
1847 
1848 void
1849 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1850 {
1851         int             i;
1852         int             entry_slot = 0;
1853 
1854         /* return if leaf is already present */
1855         for (i = 0; i < lpl_target->lpl_nrset; i++) {
1856                 if (lpl_target->lpl_rset[i] == lpl_leaf) {
1857                         return;
1858                 }
1859 
1860                 if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1861                     lpl_leaf->lpl_lgrpid) {
1862                         break;
1863                 }
1864         }
1865 
1866         /* insert leaf, update counts */
1867         entry_slot = i;
1868         i = lpl_target->lpl_nrset++;
1869 
1870         /*
1871          * Start at the end of the rset array and work backwards towards the
1872          * slot into which the new lpl will be inserted. This effectively
1873          * preserves the current ordering by scooting everybody over one entry,
1874          * and placing the new entry into the space created.
1875          */
1876         while (i-- > entry_slot) {
1877                 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1878                 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
1879                     i + 1;
1880         }
1881 
1882         lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1883         lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
1884 
1885         lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1886 }
1887 
1888 /*
1889  * Update each of lpl_parent's children with a reference to their parent.
1890  * The lgrp topology is used as the reference since it is fully
1891  * consistent and correct at this point.
1892  * This should be called after any potential change in lpl_parent's
1893  * rset.
1894  */
1895 static void
1896 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1897 {
1898         klgrpset_t      children;
1899         int             i;
1900 
1901         children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1902         if (klgrpset_isempty(children))
1903                 return; /* nothing to do */
1904 
1905         for (i = 0; i <= lgrp_alloc_max; i++) {
1906                 if (klgrpset_ismember(children, i)) {
1907                         /*
1908                          * (Re)set the parent. It may be incorrect if
1909                          * lpl_parent is new in the topology.
1910                          */
1911                         cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1912                 }
1913         }
1914 }
1915 
1916 /*
1917  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1918  *
1919  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1920  * resource. The values are adjusted here, as this is the only place that we can
1921  * be certain a resource was successfully deleted.
1922  */
1923 void
1924 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1925 {
1926         int i;
1927         lpl_t *leaf;
1928 
1929         if (lpl_target->lpl_nrset == 0)
1930                 return;
1931 
1932         /* find leaf in intermediate node */
1933         for (i = 0; i < lpl_target->lpl_nrset; i++) {
1934                 if (lpl_target->lpl_rset[i] == lpl_leaf)
1935                         break;
1936         }
1937 
1938         /* return if leaf not found */
1939         if (lpl_target->lpl_rset[i] != lpl_leaf)
1940                 return;
1941 
1942         /* prune leaf, compress array */
1943         lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1944         lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
1945         lpl_target->lpl_ncpu--;
1946         do {
1947                 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1948                 /*
1949                  * Update the lgrp id <=> rset mapping
1950                  */
1951                 if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
1952                         lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
1953                 }
1954         } while (i++ < lpl_target->lpl_nrset);
1955 }
1956 
1957 /*
1958  * Check to see if the resource set of the target lpl contains the
1959  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1960  */
1961 
1962 int
1963 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1964 {
1965         int i;
1966 
1967         for (i = 0; i < lpl_target->lpl_nrset; i++) {
1968                 if (lpl_target->lpl_rset[i] == lpl_leaf)
1969                         return (1);
1970         }
1971 
1972         return (0);
1973 }
1974 
1975 /*
1976  * Called when we change cpu lpl membership.  This increments or decrements the
1977  * per-cpu counter in every lpl in which our leaf appears.
1978  */
1979 void
1980 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1981 {
1982         cpupart_t       *cpupart;
1983         lgrp_t          *lgrp_leaf;
1984         lgrp_t          *lgrp_cur;
1985         lpl_t           *lpl_leaf;
1986         lpl_t           *lpl_cur;
1987         int             i;
1988 
1989         ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1990 
1991         cpupart = cp->cpu_part;
1992         lpl_leaf = cp->cpu_lpl;
1993         lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1994 
1995         for (i = 0; i <= lgrp_alloc_max; i++) {
1996                 lgrp_cur = lgrp_table[i];
1997 
1998                 /*
1999                  * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2000                  * for the cpu in question, or if the current lgrp and leaf
2001                  * don't share the same resources.
2002                  */
2003 
2004                 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2005                     !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2006                     lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2007                         continue;
2008 
2009 
2010                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2011 
2012                 if (lpl_cur->lpl_nrset > 0) {
2013                         if (act == LPL_INCREMENT) {
2014                                 lpl_cur->lpl_ncpu++;
2015                         } else if (act == LPL_DECREMENT) {
2016                                 lpl_cur->lpl_ncpu--;
2017                         }
2018                 }
2019         }
2020 }
2021 
2022 /*
2023  * Initialize lpl with given resources and specified lgrp
2024  */
2025 void
2026 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2027 {
2028         lpl->lpl_lgrpid = lgrp->lgrp_id;
2029         lpl->lpl_loadavg = 0;
2030         if (lpl == lpl_leaf)
2031                 lpl->lpl_ncpu = 1;
2032         else
2033                 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2034         lpl->lpl_nrset = 1;
2035         lpl->lpl_rset[0] = lpl_leaf;
2036         lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2037         lpl->lpl_lgrp = lgrp;
2038         lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2039         lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2040 }
2041 
2042 /*
2043  * Clear an unused lpl
2044  */
2045 void
2046 lpl_clear(lpl_t *lpl)
2047 {
2048         /*
2049          * Clear out all fields in the lpl except:
2050          *    lpl_lgrpid - to facilitate debugging
2051          *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2052          *
2053          * Note that the contents of the rset and id2rset arrays are zeroed too.
2054          */
2055         lpl->lpl_loadavg = 0;
2056         lpl->lpl_ncpu = 0;
2057         lpl->lpl_lgrp = NULL;
2058         lpl->lpl_parent = NULL;
2059         lpl->lpl_cpus = NULL;
2060         lpl->lpl_nrset = 0;
2061         lpl->lpl_homed_time = 0;
2062         bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2063         bzero(lpl->lpl_id2rset,
2064             sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2065 }
2066 
2067 /*
2068  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2069  * is in sync with the lgroup topology in the system.  The lpl topology may not
2070  * make full use of all of the lgroup topology, but this checks to make sure
2071  * that for the parts that it does use, it has correctly understood the
2072  * relationships that exist. This function returns
2073  * 0 if the topology is correct and, on non-DEBUG kernels, a non-zero error
2074  * code if it is incorrect.  Asserts are spread throughout the code to aid in
2075  * debugging on a DEBUG kernel.
2076  */
2077 int
2078 lpl_topo_verify(cpupart_t *cpupart)
2079 {
2080         lgrp_t          *lgrp;
2081         lpl_t           *lpl;
2082         klgrpset_t      rset;
2083         klgrpset_t      cset;
2084         cpu_t           *cpu;
2085         cpu_t           *cp_start;
2086         int             i;
2087         int             j;
2088         int             sum;
2089 
2090         /* topology can't be incorrect if it doesn't exist */
2091         if (!lgrp_topo_initialized || !lgrp_initialized)
2092                 return (LPL_TOPO_CORRECT);
2093 
2094         ASSERT(cpupart != NULL);
2095 
2096         for (i = 0; i <= lgrp_alloc_max; i++) {
2097                 lgrp = lgrp_table[i];
2098                 lpl = NULL;
2099                 /* make sure lpls are allocated */
2100                 ASSERT(cpupart->cp_lgrploads);
2101                 if (!cpupart->cp_lgrploads)
2102                         return (LPL_TOPO_PART_HAS_NO_LPL);
2103 
2104                 lpl = &cpupart->cp_lgrploads[i];
2105                 /* make sure our index is good */
2106                 ASSERT(i < cpupart->cp_nlgrploads);
2107 
2108                 /* if lgroup doesn't exist, make sure lpl is empty */
2109                 if (!LGRP_EXISTS(lgrp)) {
2110                         ASSERT(lpl->lpl_ncpu == 0);
2111                         if (lpl->lpl_ncpu > 0) {
2112                                 return (LPL_TOPO_CPUS_NOT_EMPTY);
2113                         } else {
2114                                 continue;
2115                         }
2116                 }
2117 
2118                 /* verify that lgroup and lpl are identically numbered */
2119                 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2120 
2121                 /* if lgroup isn't in our partition, make sure lpl is empty */
2122                 if (!klgrpset_intersects(lgrp->lgrp_leaves,
2123                     cpupart->cp_lgrpset)) {
2124                         ASSERT(lpl->lpl_ncpu == 0);
2125                         if (lpl->lpl_ncpu > 0) {
2126                                 return (LPL_TOPO_CPUS_NOT_EMPTY);
2127                         }
2128                         /*
2129                          * lpl is empty, and lgroup isn't in partition.  verify
2130                          * that lpl doesn't show up in anyone else's rsets (in
2131                          * this partition, anyway)
2132                          */
2133                         for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2134                                 lpl_t *i_lpl; /* lpl we're iterating over */
2135 
2136                                 i_lpl = &cpupart->cp_lgrploads[j];
2137 
2138                                 ASSERT(!lpl_rset_contains(i_lpl, lpl));
2139                                 if (lpl_rset_contains(i_lpl, lpl)) {
2140                                         return (LPL_TOPO_LPL_ORPHANED);
2141                                 }
2142                         }
2143                         /* lgroup is empty, and everything is ok. continue */
2144                         continue;
2145                 }
2146 
2147 
2148                 /* lgroup is in this partition, now check it against lpl */
2149 
2150                 /* do both have matching lgrps? */
2151                 ASSERT(lgrp == lpl->lpl_lgrp);
2152                 if (lgrp != lpl->lpl_lgrp) {
2153                         return (LPL_TOPO_LGRP_MISMATCH);
2154                 }
2155 
2156                 /* do the parent lgroups exist and do they match? */
2157                 if (lgrp->lgrp_parent) {
2158                         ASSERT(lpl->lpl_parent);
2159                         ASSERT(lgrp->lgrp_parent->lgrp_id ==
2160                             lpl->lpl_parent->lpl_lgrpid);
2161 
2162                         if (!lpl->lpl_parent) {
2163                                 return (LPL_TOPO_MISSING_PARENT);
2164                         } else if (lgrp->lgrp_parent->lgrp_id !=
2165                             lpl->lpl_parent->lpl_lgrpid) {
2166                                 return (LPL_TOPO_PARENT_MISMATCH);
2167                         }
2168                 }
2169 
2170                 /* only leaf lgroups keep a cpucnt, only check leaves */
2171                 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2172 
2173                         /* verify that lgrp is also a leaf */
2174                         ASSERT((lgrp->lgrp_childcnt == 0) &&
2175                             (klgrpset_ismember(lgrp->lgrp_leaves,
2176                             lpl->lpl_lgrpid)));
2177 
2178                         if ((lgrp->lgrp_childcnt > 0) ||
2179                             (!klgrpset_ismember(lgrp->lgrp_leaves,
2180                             lpl->lpl_lgrpid))) {
2181                                 return (LPL_TOPO_LGRP_NOT_LEAF);
2182                         }
2183 
2184                         ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2185                             (lpl->lpl_ncpu > 0));
2186                         if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2187                             (lpl->lpl_ncpu <= 0)) {
2188                                 return (LPL_TOPO_BAD_CPUCNT);
2189                         }
2190 
2191                         /*
2192                          * Check that lpl_ncpu also matches the number of
2193                          * cpus in the lpl's linked list.  This only exists in
2194                          * leaves, but they should always match.
2195                          */
2196                         j = 0;
2197                         cpu = cp_start = lpl->lpl_cpus;
2198                         while (cpu != NULL) {
2199                                 j++;
2200 
2201                                 /* check to make sure cpu's lpl is leaf lpl */
2202                                 ASSERT(cpu->cpu_lpl == lpl);
2203                                 if (cpu->cpu_lpl != lpl) {
2204                                         return (LPL_TOPO_CPU_HAS_BAD_LPL);
2205                                 }
2206 
2207                                 /* check next cpu */
2208                                 if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2209                                         continue;
2210                                 } else {
2211                                         cpu = NULL;
2212                                 }
2213                         }
2214 
2215                         ASSERT(j == lpl->lpl_ncpu);
2216                         if (j != lpl->lpl_ncpu) {
2217                                 return (LPL_TOPO_LPL_BAD_NCPU);
2218                         }
2219 
2220                         /*
2221                          * Also, check that leaf lpl is contained in all
2222                          * intermediate lpls that name the leaf as a descendant
2223                          */
2224                         for (j = 0; j <= lgrp_alloc_max; j++) {
2225                                 klgrpset_t intersect;
2226                                 lgrp_t *lgrp_cand;
2227                                 lpl_t *lpl_cand;
2228 
2229                                 lgrp_cand = lgrp_table[j];
2230                                 intersect = klgrpset_intersects(
2231                                     lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2232                                     cpupart->cp_lgrpset);
2233 
2234                                 if (!LGRP_EXISTS(lgrp_cand) ||
2235                                     !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2236                                     cpupart->cp_lgrpset) ||
2237                                     (intersect == 0))
2238                                         continue;
2239 
2240                                 lpl_cand =
2241                                     &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2242 
2243                                 if (klgrpset_ismember(intersect,
2244                                     lgrp->lgrp_id)) {
2245                                         ASSERT(lpl_rset_contains(lpl_cand,
2246                                             lpl));
2247 
2248                                         if (!lpl_rset_contains(lpl_cand, lpl)) {
2249                                                 return (LPL_TOPO_RSET_MSSNG_LF);
2250                                         }
2251                                 }
2252                         }
2253 
2254                 } else { /* non-leaf specific checks */
2255 
2256                         /*
2257                          * Non-leaf lpls should have lpl_cpus == NULL
2258                          * verify that this is so
2259                          */
2260                         ASSERT(lpl->lpl_cpus == NULL);
2261                         if (lpl->lpl_cpus != NULL) {
2262                                 return (LPL_TOPO_NONLEAF_HAS_CPUS);
2263                         }
2264 
2265                         /*
2266                          * verify that the sum of the cpus in the leaf resources
2267                          * is equal to the total ncpu in the intermediate
2268                          */
2269                         for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2270                                 sum += lpl->lpl_rset[j]->lpl_ncpu;
2271                         }
2272 
2273                         ASSERT(sum == lpl->lpl_ncpu);
2274                         if (sum != lpl->lpl_ncpu) {
2275                                 return (LPL_TOPO_LPL_BAD_NCPU);
2276                         }
2277                 }
2278 
2279                 /*
2280                  * Check the rset of the lpl in question.  Make sure that each
2281                  * rset contains a subset of the resources in
2282                  * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2283                  * sure that each rset doesn't include resources that are
2284                  * outside of that set.  (Which would be resources somehow not
2285                  * accounted for).
2286                  */
2287                 klgrpset_clear(rset);
2288                 for (j = 0; j < lpl->lpl_nrset; j++) {
2289                         klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2290                 }
2291                 klgrpset_copy(cset, rset);
2292                 /* make sure lpl rset matches lgrp rset */
2293                 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2294                 /* make sure rset is contained within the partition, too */
2295                 klgrpset_diff(cset, cpupart->cp_lgrpset);
2296 
2297                 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
2298                 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
2299                         return (LPL_TOPO_RSET_MISMATCH);
2300                 }
2301 
2302                 /*
2303                  * check to make sure lpl_nrset matches the number of rsets
2304                  * contained in the lpl
2305                  */
2306                 for (j = 0; j < lpl->lpl_nrset; j++) {
2307                         if (lpl->lpl_rset[j] == NULL)
2308                                 break;
2309                 }
2310 
2311                 ASSERT(j == lpl->lpl_nrset);
2312                 if (j != lpl->lpl_nrset) {
2313                         return (LPL_TOPO_BAD_RSETCNT);
2314                 }
2315 
2316         }
2317         return (LPL_TOPO_CORRECT);
2318 }
2319 
2320 /*
2321  * Flatten lpl topology to given number of levels.  This is presently only
2322  * implemented for a flatten to 2 levels, which will prune out the intermediates
2323  * and home the leaf lpls to the root lpl.
2324  */
2325 int
2326 lpl_topo_flatten(int levels)
2327 {
2328         int             i;
2329         uint_t          sum;
2330         lgrp_t          *lgrp_cur;
2331         lpl_t           *lpl_cur;
2332         lpl_t           *lpl_root;
2333         cpupart_t       *cp;
2334 
2335         if (levels != 2)
2336                 return (0);
2337 
2338         /* called w/ cpus paused - grab no locks! */
2339         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2340             !lgrp_initialized);
2341 
2342         cp = cp_list_head;
2343         do {
2344                 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2345                 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2346 
2347                 for (i = 0; i <= lgrp_alloc_max; i++) {
2348                         lgrp_cur = lgrp_table[i];
2349                         lpl_cur = &cp->cp_lgrploads[i];
2350 
2351                         if ((lgrp_cur == lgrp_root) ||
2352                             (!LGRP_EXISTS(lgrp_cur) &&
2353                             (lpl_cur->lpl_ncpu == 0)))
2354                                 continue;
2355 
2356                         if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2357                                 /*
2358                                  * this should be a deleted intermediate, so
2359                                  * clear it
2360                                  */
2361                                 lpl_clear(lpl_cur);
2362                         } else if ((lpl_cur->lpl_nrset == 1) &&
2363                             (lpl_cur->lpl_rset[0] == lpl_cur) &&
2364                             ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2365                             (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2366                                 /*
2367                                  * this is a leaf whose parent was deleted, or
2368                                  * whose parent had their lgrp deleted.  (And
2369                                  * whose parent will soon be deleted).  Point
2370                                  * this guy back to the root lpl.
2371                                  */
2372                                 lpl_cur->lpl_parent = lpl_root;
2373                                 lpl_rset_add(lpl_root, lpl_cur);
2374                         }
2375 
2376                 }
2377 
2378                 /*
2379                  * Now that we're done, make sure the count on the root lpl is
2380                  * correct, and update the hints of the children for the sake of
2381                  * thoroughness
2382                  */
2383                 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2384                         sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2385                 }
2386                 lpl_root->lpl_ncpu = sum;
2387                 lpl_child_update(lpl_root, cp);
2388 
2389                 cp = cp->cp_next;
2390         } while (cp != cp_list_head);
2391 
2392         return (levels);
2393 }
2394 
2395 /*
2396  * Insert a lpl into the resource hierarchy and create any additional lpls that
2397  * are necessary to represent the varying states of locality for the cpu
2398  * resources newly added to the partition.
2399  *
2400  * This routine is clever enough that it can correctly add resources from the
2401  * new leaf into both direct and indirect resource sets in the hierarchy.  (I.e.,
2402  * those for which the lpl is a leaf as opposed to simply a named equally local
2403  * resource).  The one special case that needs additional processing is when a
2404  * new intermediate lpl is introduced.  Since the main loop only traverses
2405  * looking to add the leaf resource where it does not yet exist, additional work
2406  * is necessary to add other leaf resources that may need to exist in the newly
2407  * created intermediate.  This is performed by the second inner loop, and is
2408  * only done when the check for more than one overlapping resource succeeds.
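 *
 * As an illustrative (assumed) example: if a new leaf lpl for lgrp 2 is
 * inserted and an existing intermediate lgrp names lgrps { 1, 2 } among its
 * CPU resources within this partition, the intermediate's lpl is
 * (re)initialized with the new leaf and the second inner loop then adds the
 * lpl for lgrp 1 to its rset as well.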
2409  */
2410 
2411 void
2412 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2413 {
2414         int             i;
2415         int             j;
2416         int             rset_num_intersect;
2417         lgrp_t          *lgrp_cur;
2418         lpl_t           *lpl_cur;
2419         lpl_t           *lpl_parent;
2420         lgrp_id_t       parent_id;
2421         klgrpset_t      rset_intersect; /* resources in cpupart and lgrp */
2422 
2423         for (i = 0; i <= lgrp_alloc_max; i++) {
2424                 lgrp_cur = lgrp_table[i];
2425 
2426                 /*
2427                  * Don't insert if the lgrp isn't there, if the leaf isn't
2428                  * contained within the current lgrp, or if the current lgrp has
2429                  * no leaves in this partition
2430                  */
2431 
2432                 if (!LGRP_EXISTS(lgrp_cur) ||
2433                     !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2434                     lpl_leaf->lpl_lgrpid) ||
2435                     !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2436                     cpupart->cp_lgrpset))
2437                         continue;
2438 
2439                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2440                 if (lgrp_cur->lgrp_parent != NULL) {
2441                         /* if lgrp has a parent, assign it properly */
2442                         parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2443                         lpl_parent = &cpupart->cp_lgrploads[parent_id];
2444                 } else {
2445                         /* if not, make sure parent ptr gets set to null */
2446                         lpl_parent = NULL;
2447                 }
2448 
2449                 if (lpl_cur == lpl_leaf) {
2450                         /*
2451                          * Almost all leaf state was initialized elsewhere.  The
2452                          * only thing left to do is to set the parent.
2453                          */
2454                         lpl_cur->lpl_parent = lpl_parent;
2455                         continue;
2456                 }
2457 
2458                 lpl_clear(lpl_cur);
2459                 lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2460 
2461                 lpl_cur->lpl_parent = lpl_parent;
2462 
2463                 /* does new lpl need to be populated with other resources? */
2464                 rset_intersect =
2465                     klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2466                     cpupart->cp_lgrpset);
2467                 klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2468 
2469                 if (rset_num_intersect > 1) {
2470                         /*
2471                          * If so, figure out what lpls have resources that
2472                          * intersect this one, and add them.
2473                          */
2474                         for (j = 0; j <= lgrp_alloc_max; j++) {
2475                                 lgrp_t  *lgrp_cand;     /* candidate lgrp */
2476                                 lpl_t   *lpl_cand;      /* candidate lpl */
2477 
2478                                 lgrp_cand = lgrp_table[j];
2479                                 if (!LGRP_EXISTS(lgrp_cand) ||
2480                                     !klgrpset_ismember(rset_intersect,
2481                                     lgrp_cand->lgrp_id))
2482                                         continue;
2483                                 lpl_cand =
2484                                     &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2485                                 lpl_rset_add(lpl_cur, lpl_cand);
2486                         }
2487                 }
2488                 /*
2489                  * This lpl's rset has changed. Update the hint in its
2490                  * children.
2491                  */
2492                 lpl_child_update(lpl_cur, cpupart);
2493         }
2494 }
2495 
2496 /*
2497  * remove a lpl from the hierarchy of resources, clearing its state when
2498  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2499  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2500  * delete them as well.
2501  */
2502 
2503 void
2504 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2505 {
2506         int             i;
2507         lgrp_t          *lgrp_cur;
2508         lpl_t           *lpl_cur;
2509         klgrpset_t      leaf_intersect; /* intersection of leaves */
2510 
2511         for (i = 0; i <= lgrp_alloc_max; i++) {
2512                 lgrp_cur = lgrp_table[i];
2513 
2514                 /*
2515                  * Don't attempt to remove from lgrps that aren't there, that
2516                  * don't contain our leaf, or from the leaf itself. (We do that
2517                  * later)
2518                  */
2519 
2520                 if (!LGRP_EXISTS(lgrp_cur))
2521                         continue;
2522 
2523                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2524 
2525                 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2526                     lpl_leaf->lpl_lgrpid) ||
2527                     (lpl_cur == lpl_leaf)) {
2528                         continue;
2529                 }
2530 
2531                 /*
2532                  * This is a slightly sleazy simplification in that we have
2533                  * already marked the cp_lgrpset as no longer containing the
2534                  * leaf we've deleted.  Any lpls that pass the above checks
2535                  * based upon lgrp membership but not necessarily cpu-part
2536                  * membership also get cleared by the checks below.  Currently
2537                  * this is harmless, as the lpls should be empty anyway.
2538                  *
2539                  * In particular, we want to preserve lpls that have additional
2540                  * leaf resources, even though we don't yet have a processor
2541                  * architecture that represents resources this way.
2542                  */
2543 
2544                 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2545                     cpupart->cp_lgrpset);
2546 
2547                 lpl_rset_del(lpl_cur, lpl_leaf);
2548                 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2549                         lpl_clear(lpl_cur);
2550                 } else {
2551                         /*
2552                          * Update this lpl's children
2553                          */
2554                         lpl_child_update(lpl_cur, cpupart);
2555                 }
2556         }
2557         lpl_clear(lpl_leaf);
2558 }
2559 
2560 /*
2561  * add a cpu to a partition in terms of lgrp load avg bookkeeping
2562  *
2563  * The lpl (cpu partition load average information) is now arranged in a
2564  * hierarchical fashion whereby resources that are closest, ie. most local, to
2565  * the cpu in question are considered to be leaves in a tree of resources.
2566  * There are two general cases for cpu addition:
2567  *
2568  * 1. A lpl structure that contains resources already in the hierarchy tree.
2569  * In this case, all of the associated lpl relationships have been defined, and
2570  * all that is necessary is that we link the new cpu into the per-lpl list of
2571  * cpus, and increment the ncpu count of all places where this cpu resource will
2572  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2573  * pushing is accomplished by this routine.
2574  *
2575  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2576  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2577  * construct the hierarchy of state necessary to name its more distant
2578  * resources, if they should exist.  The leaf structure is initialized by this
2579  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2580  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2581  * and builds all of the "ancestral" state necessary to identify resources at
2582  * differing levels of locality.
2583  */
2584 void
2585 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2586 {
2587         cpupart_t       *cpupart;
2588         lgrp_t          *lgrp_leaf;
2589         lpl_t           *lpl_leaf;
2590 
2591         /* called sometimes w/ cpus paused - grab no locks */
2592         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2593 
2594         cpupart = cp->cpu_part;
2595         lgrp_leaf = lgrp_table[lgrpid];
2596 
2597         /* don't add non-existent lgrp */
2598         ASSERT(LGRP_EXISTS(lgrp_leaf));
2599         lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2600         cp->cpu_lpl = lpl_leaf;
2601 
2602         /* only leaf lpls contain cpus */
2603 
2604         if (lpl_leaf->lpl_ncpu++ == 0) {
2605                 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2606                 klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2607                 lpl_leaf_insert(lpl_leaf, cpupart);
2608         } else {
2609                 /*
2610                  * the lpl should already exist in the parent, so just update
2611                  * the count of available CPUs
2612                  */
2613                 lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2614         }
2615 
2616         /* link cpu into list of cpus in lpl */
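        /*
         * The list is circular and doubly linked; the new cpu ends up just
         * before the current head, i.e. at the tail of the list.
         */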
2617 
2618         if (lpl_leaf->lpl_cpus) {
2619                 cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2620                 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2621                 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2622                 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2623         } else {
2624                 /*
2625                  * We increment ncpu immediately after we create a new leaf
2626                  * lpl, so assert that ncpu == 1 for the case where we don't
2627                  * have any cpu pointers yet.
2628                  */
2629                 ASSERT(lpl_leaf->lpl_ncpu == 1);
2630                 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2631         }
2632 
2633 }
2634 
2635 
2636 /*
2637  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2638  *
2639  * The lpl (cpu partition load average information) is now arranged in a
2640  * hierarchical fashion whereby resources that are closest, ie. most local, to
2641  * the cpu in question are considered to be leaves in a tree of resources.
2642  * There are two removal cases in question:
2643  *
2644  * 1. Removal of the resource in the leaf leaves other resources remaining in
2645  * that leaf.  (Another cpu still exists at this level of locality).  In this
2646  * case, the count of available cpus is decremented in all associated lpls by
2647  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2648  * from the lpl's list of cpus.
2649  *
2650  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2651  * empty)  In this case, all of what has occurred for the first step must take
2652  * place; however, additionally we must remove the lpl structure itself, prune
2653  * out any stranded lpls that do not directly name a leaf resource, and mark the
2654  * cpu partition in question as no longer containing resources from the lgrp of
2655  * the lpl that has been deleted.  Cpu-partition changes are handled by this
2656  * method, but the lpl_leaf_remove function deals with the details of pruning
2657  * out the empty lpl and any of its orphaned direct ancestors.
2658  */
2659 void
2660 lgrp_part_del_cpu(cpu_t *cp)
2661 {
2662         lpl_t           *lpl;
2663         lpl_t           *leaf_lpl;
2664         lgrp_t          *lgrp_leaf;
2665 
2666         /* called sometimes w/ cpus paused - grab no locks */
2667 
2668         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2669 
2670         lpl = leaf_lpl = cp->cpu_lpl;
2671         lgrp_leaf = leaf_lpl->lpl_lgrp;
2672 
2673         /* don't delete a leaf that isn't there */
2674         ASSERT(LGRP_EXISTS(lgrp_leaf));
2675 
2676         /* no double-deletes */
2677         ASSERT(lpl->lpl_ncpu);
2678         if (--lpl->lpl_ncpu == 0) {
2679                 /*
2680                  * This was the last cpu in this lgroup for this partition,
2681                  * clear its bit in the partition's lgroup bitmask
2682                  */
2683                 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2684 
2685                 /* eliminate remaining lpl link pointers in cpu, lpl */
2686                 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2687 
2688                 lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2689         } else {
2690 
2691                 /* unlink cpu from lists of cpus in lpl */
2692                 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2693                 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2694                 if (lpl->lpl_cpus == cp) {
2695                         lpl->lpl_cpus = cp->cpu_next_lpl;
2696                 }
2697 
2698                 /*
2699                  * Update the cpu count in the lpls associated with parent
2700                  * lgroups.
2701                  */
2702                 lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2703 
2704         }
2705         /* clear cpu's lpl ptr when we're all done */
2706         cp->cpu_lpl = NULL;
2707 }
2708 
2709 /*
2710  * Recompute load average for the specified partition/lgrp fragment.
2711  *
2712  * We rely on the fact that this routine is called from the clock thread
2713  * at a point before the clock thread can block (i.e. before its first
2714  * lock request).  Since the clock thread can not be preempted (since it
2715  * runs at highest priority), we know that cpu partitions can not change
2716  * (since doing so would require either the repartition requester or the
2717  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2718  * without grabbing cpu_lock.
2719  */
2720 void
2721 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2722 {
2723         uint_t          ncpu;
2724         int64_t         old, new, f;
2725 
2726         /*
2727          * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
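 *
 * That is, expval[ncpu] holds (1 - exp(-1/(20 * ncpu))) scaled by 2^16
 * (i.e. as a 16-bit binary fraction), e.g. 0.0488 * 65536 ~= 3196 for
 * 1 cpu and 0.0247 * 65536 ~= 1618 for 2 cpus.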
2728          */
2729         static short expval[] = {
2730             0, 3196, 1618, 1083,
2731             814, 652, 543, 466,
2732             408, 363, 326, 297,
2733             272, 251, 233, 218,
2734             204, 192, 181, 172,
2735             163, 155, 148, 142,
2736             136, 130, 125, 121,
2737             116, 112, 109, 105
2738         };
2739 
2740         /* ASSERT (called from clock level) */
2741 
2742         if ((lpl == NULL) ||    /* we're booting - this is easiest for now */
2743             ((ncpu = lpl->lpl_ncpu) == 0)) {
2744                 return;
2745         }
2746 
2747         for (;;) {
2748 
2749                 if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2750                         f = expval[1]/ncpu; /* good approx. for large ncpu */
2751                 else
2752                         f = expval[ncpu];
2753 
2754                 /*
2755                  * Modify the load average atomically to avoid losing
2756                  * anticipatory load updates (see lgrp_move_thread()).
2757                  */
2758                 if (ageflag) {
2759                         /*
2760                          * We're supposed to both update and age the load.
2761                          * This happens 10 times/sec. per cpu.  We do a
2762                          * little hoop-jumping to avoid integer overflow.
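                         *
                         * In effect (ignoring rounding), the fixed-point math
                         * below computes roughly
                         *
                         *    new = old * exp(-1/(20 * ncpu)) +
                         *        (nrcpus << 9) * (1 - exp(-1/(20 * ncpu)))
                         *
                         * i.e. an exponential decay of the old load toward
                         * the current demand.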
2763                          */
2764                         int64_t         q, r;
2765 
2766                         do {
2767                                 old = new = lpl->lpl_loadavg;
2768                                 q = (old  >> 16) << 7;
2769                                 r = (old  & 0xffff) << 7;
2770                                 new += ((long long)(nrcpus - q) * f -
2771                                     ((r * f) >> 16)) >> 7;
2772 
2773                                 /*
2774                                  * Check for overflow
2775                                  */
2776                                 if (new > LGRP_LOADAVG_MAX)
2777                                         new = LGRP_LOADAVG_MAX;
2778                                 else if (new < 0)
2779                                         new = 0;
2780                         } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2781                             old, new) != old);
2782                 } else {
2783                         /*
2784                          * We're supposed to update the load, but not age it.
2785                          * This option is used to update the load (which either
2786                          * has already been aged in this 1/10 sec. interval or
2787                          * soon will be) to account for a remotely executing
2788                          * thread.
2789                          */
2790                         do {
2791                                 old = new = lpl->lpl_loadavg;
2792                                 new += f;
2793                                 /*
2794                                  * Check for overflow
2795                                  * Underflow not possible here
2796                                  */
2797                                 if (new < old)
2798                                         new = LGRP_LOADAVG_MAX;
2799                         } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2800                             old, new) != old);
2801                 }
2802 
2803                 /*
2804                  * Do the same for this lpl's parent
2805                  */
2806                 if ((lpl = lpl->lpl_parent) == NULL)
2807                         break;
2808                 ncpu = lpl->lpl_ncpu;
2809         }
2810 }
2811 
2812 /*
2813  * Initialize lpl topology in the target based on topology currently present in
2814  * lpl_bootstrap.
2815  *
2816  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2817  * initialize cp_default list of lpls. Up to this point all topology operations
2818  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2819  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2820  * `target' points to the list of lpls in cp_default and `size' is the size of
2821  * this list.
2822  *
2823  * This function walks the lpl topology in lpl_bootstrap and does four things:
2824  *
2825  * 1) Copies all fields from lpl_bootstrap to the target.
2826  *
2827  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2828  *
2829  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2830  *    instead of lpl_bootstrap.
2831  *
2832  * 4) Updates pointers in the resource list of the target to point to the lpls
2833  *    in the target list instead of lpl_bootstrap.
2834  *
2835  * After lpl_topo_bootstrap() completes, target contains the same information
2836  * that would be present there if it were used during boot instead of
2837  * lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
2838  * this point, so it is bzeroed.
2839  */
2840 void
2841 lpl_topo_bootstrap(lpl_t *target, int size)
2842 {
2843         lpl_t   *lpl = lpl_bootstrap;
2844         lpl_t   *target_lpl = target;
2845         lpl_t   **rset;
2846         int     *id2rset;
2847         int     sz;
2848         int     howmany;
2849         int     id;
2850         int     i;
2851 
2852         /*
2853          * The only target that should be passed here is cp_default lpl list.
2854          */
2855         ASSERT(target == cp_default.cp_lgrploads);
2856         ASSERT(size == cp_default.cp_nlgrploads);
2857         ASSERT(!lgrp_topo_initialized);
2858         ASSERT(ncpus == 1);
2859 
2860         howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2861         for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2862                 /*
2863                  * Copy all fields from lpl, except for the rset,
2864                  * lgrp id <=> rset mapping storage,
2865                  * and amount of storage
2866                  */
2867                 rset = target_lpl->lpl_rset;
2868                 id2rset = target_lpl->lpl_id2rset;
2869                 sz = target_lpl->lpl_rset_sz;
2870 
2871                 *target_lpl = *lpl;
2872 
2873                 target_lpl->lpl_rset_sz = sz;
2874                 target_lpl->lpl_rset = rset;
2875                 target_lpl->lpl_id2rset = id2rset;
2876 
2877                 /*
2878                  * Substitute CPU0 lpl pointer with one relative to target.
2879                  */
2880                 if (lpl->lpl_cpus == CPU) {
2881                         ASSERT(CPU->cpu_lpl == lpl);
2882                         CPU->cpu_lpl = target_lpl;
2883                 }
2884 
2885                 /*
2886                  * Substitute parent information with parent relative to target.
2887                  */
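                     /*
                      * e.g. if lpl_parent points at the second element of
                      * lpl_bootstrap, the rebased pointer is the second element
                      * of target; only the array base changes.
                      */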
2888                 if (lpl->lpl_parent != NULL)
2889                         target_lpl->lpl_parent = (lpl_t *)
2890                             (((uintptr_t)lpl->lpl_parent -
2891                             (uintptr_t)lpl_bootstrap) +
2892                             (uintptr_t)target);
2893 
2894                 /*
2895                  * Walk over the resource set, substituting pointers relative
2896                  * to lpl_bootstrap with pointers relative to target
2897                  */
2898                 ASSERT(lpl->lpl_nrset <= 1);
2899 
2900                 for (id = 0; id < lpl->lpl_nrset; id++) {
2901                         if (lpl->lpl_rset[id] != NULL) {
2902                                 target_lpl->lpl_rset[id] = (lpl_t *)
2903                                     (((uintptr_t)lpl->lpl_rset[id] -
2904                                     (uintptr_t)lpl_bootstrap) +
2905                                     (uintptr_t)target);
2906                         }
2907                         target_lpl->lpl_id2rset[id] =
2908                             lpl->lpl_id2rset[id];
2909                 }
2910         }
2911 
2912         /*
2913          * Clean up the bootstrap lpls since we have switched over to the
2914          * actual lpl array in the default cpu partition.
2915          *
2916          * We still need to keep one empty lpl around for newly starting
2917          * slave CPUs to reference should they need to make it through the
2918          * dispatcher prior to their lgrp/lpl initialization.
2919          *
2920  * The lpl-related dispatcher code has been designed to work properly
2921  * (and without extra checks) for this special case of a zeroed
2922          * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
2923          * with lgrpid 0 and an empty resource set. Iteration over the rset
2924          * array by the dispatcher is also NULL terminated for this reason.
2925          *
2926          * This provides the desired behaviour for an uninitialized CPU.
2927          * It shouldn't see any other CPU to either dispatch to or steal
2928          * from until it is properly initialized.
2929          */
2930         bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2931         bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
2932         bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
2933 
2934         lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
2935         lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
2936 }
2937 
2938 /*
2939  * If the lowest load among the lgroups a process' threads are currently
2940  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2941  * expanding the process to a new lgroup.
2942  */
2943 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2944 lgrp_load_t     lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2945 
2946 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
2947         ((lgrp_expand_proc_thresh) / (ncpu))
2948 
2949 /*
2950  * A process will be expanded to a new lgroup only if the difference between
2951  * the lowest load on the lgroups the process' threads are currently spread
2952  * across and the lowest load on the other lgroups in the process' partition
2953  * is greater than lgrp_expand_proc_diff.
2954  */
2955 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2956 lgrp_load_t     lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2957 
2958 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
2959         ((lgrp_expand_proc_diff) / (ncpu))
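     /*
      * Worked example with the defaults above: for 4-CPU lgroups,
      * LGRP_EXPAND_PROC_THRESH(4) == 62250 / 4 == 15562 and
      * LGRP_EXPAND_PROC_DIFF(4) == 60000 / 4 == 15000, so lgrp_choose()
      * only spreads a process onto a new lgroup once the least loaded of
      * its current lgroups is above 15562 and some other leaf is more
      * than 15000 lighter.
      */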
2960 
2961 /*
2962  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2963  * be present due to impreciseness of the load average decay algorithm.
2964  *
2965  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2966  * tolerance is scaled by the number of cpus in the lgroup just like
2967  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2968  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2969  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2970  */
2971 uint32_t        lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2972 #define LGRP_LOADAVG_TOLERANCE(ncpu)    \
2973         ((lgrp_loadavg_tolerance) / ncpu)
2974 
2975 /*
2976  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2977  * average is above this threshold
2978  */
2979 uint32_t        lgrp_load_thresh = UINT32_MAX;
2980 
2981 /*
2982  * lgrp_choose() will try to skip any lgroups with less free memory
2983  * than this when choosing a home lgroup
2984  */
2985 pgcnt_t lgrp_mem_free_thresh = 0;
2986 
2987 /*
2988  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2989  * one based on one of the following policies:
2990  * - Random selection
2991  * - Pseudo round robin placement
2992  * - Longest time since a thread was last placed
2993  */
2994 #define LGRP_CHOOSE_RANDOM      1
2995 #define LGRP_CHOOSE_RR          2
2996 #define LGRP_CHOOSE_TIME        3
2997 
2998 int     lgrp_choose_policy = LGRP_CHOOSE_TIME;
2999 
3000 /*
3001  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3002  * be bound to a CPU or processor set.
3003  *
3004  * Arguments:
3005  *      t               The thread
3006  *      cpupart         The partition the thread belongs to.
3007  *
3008  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3009  *       disabled, or thread_lock held (at splhigh) to protect against the CPU
3010  *       partitions changing out from under us, and assumes that the given
3011  *       thread is protected.  Also, this is sometimes called with CPUs
3012  *       paused or kernel preemption disabled, so don't grab any locks
3013  *       because we should never block under those conditions.
3014  */
3015 lpl_t *
3016 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3017 {
3018         lgrp_load_t     bestload, bestrload;
3019         int             lgrpid_offset, lgrp_count;
3020         lgrp_id_t       lgrpid, lgrpid_start;
3021         lpl_t           *lpl, *bestlpl, *bestrlpl;
3022         klgrpset_t      lgrpset;
3023         proc_t          *p;
3024 
3025         ASSERT(t != NULL);
3026         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3027             THREAD_LOCK_HELD(t));
3028         ASSERT(cpupart != NULL);
3029 
3030         p = t->t_procp;
3031 
3032         /* A process should always be in an active partition */
3033         ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3034 
3035         bestlpl = bestrlpl = NULL;
3036         bestload = bestrload = LGRP_LOADAVG_MAX;
3037         lgrpset = cpupart->cp_lgrpset;
3038 
3039         switch (lgrp_choose_policy) {
3040         case LGRP_CHOOSE_RR:
3041                 lgrpid = cpupart->cp_lgrp_hint;
3042                 do {
3043                         if (++lgrpid > lgrp_alloc_max)
3044                                 lgrpid = 0;
3045                 } while (!klgrpset_ismember(lgrpset, lgrpid));
3046 
3047                 break;
3048         default:
3049         case LGRP_CHOOSE_TIME:
3050         case LGRP_CHOOSE_RANDOM:
3051                 klgrpset_nlgrps(lgrpset, lgrp_count);
3052                 lgrpid_offset =
3053                     (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3054                 for (lgrpid = 0; ; lgrpid++) {
3055                         if (klgrpset_ismember(lgrpset, lgrpid)) {
3056                                 if (--lgrpid_offset == 0)
3057                                         break;
3058                         }
3059                 }
3060                 break;
3061         }
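             /*
              * e.g. under LGRP_CHOOSE_TIME/RANDOM with three lgroups in
              * lgrpset and (gethrtime() >> 4) % 3 == 1, lgrpid_offset starts
              * at 2, so the loop above stops on the set's second member;
              * under LGRP_CHOOSE_RR the search instead starts at the next
              * member after the previous hint.
              */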
3062 
3063         lgrpid_start = lgrpid;
3064 
3065         DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3066             lgrp_id_t, cpupart->cp_lgrp_hint);
3067 
3068         /*
3069          * Use lgroup affinities (if any) to choose best lgroup
3070          *
3071          * NOTE: Assumes that thread is protected from going away and its
3072          *       lgroup affinities won't change (ie. p_lock, or
3073          *       thread_lock() being held and/or CPUs paused)
3074          */
3075         if (t->t_lgrp_affinity) {
3076                 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3077                 if (lpl != NULL)
3078                         return (lpl);
3079         }
3080 
3081         ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3082 
3083         do {
3084                 pgcnt_t npgs;
3085 
3086                 /*
3087                  * Skip any lgroups outside of thread's pset
3088                  */
3089                 if (!klgrpset_ismember(lgrpset, lgrpid)) {
3090                         if (++lgrpid > lgrp_alloc_max)
3091                                 lgrpid = 0;     /* wrap the search */
3092                         continue;
3093                 }
3094 
3095                 /*
3096                  * Skip any non-leaf lgroups
3097                  */
3098                 if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3099                         continue;
3100 
3101                 /*
3102                  * Skip any lgroups without enough free memory
3103                  * (when threshold set to nonzero positive value)
3104                  */
3105                 if (lgrp_mem_free_thresh > 0) {
3106                         npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3107                         if (npgs < lgrp_mem_free_thresh) {
3108                                 if (++lgrpid > lgrp_alloc_max)
3109                                         lgrpid = 0;     /* wrap the search */
3110                                 continue;
3111                         }
3112                 }
3113 
3114                 lpl = &cpupart->cp_lgrploads[lgrpid];
3115                 if (klgrpset_isempty(p->p_lgrpset) ||
3116                     klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3117                         /*
3118                          * Either this is a new process or the process already
3119                          * has threads on this lgrp, so this is a preferred
3120                          * lgroup for the thread.
3121                          */
3122                         if (bestlpl == NULL ||
3123                             lpl_pick(lpl, bestlpl)) {
3124                                 bestload = lpl->lpl_loadavg;
3125                                 bestlpl = lpl;
3126                         }
3127                 } else {
3128                         /*
3129                          * The process doesn't have any threads on this lgrp,
3130                          * but we're willing to consider this lgrp if the load
3131                          * difference is big enough to justify splitting up
3132                          * the process' threads.
3133                          */
3134                         if (bestrlpl == NULL ||
3135                             lpl_pick(lpl, bestrlpl)) {
3136                                 bestrload = lpl->lpl_loadavg;
3137                                 bestrlpl = lpl;
3138                         }
3139                 }
3140                 if (++lgrpid > lgrp_alloc_max)
3141                         lgrpid = 0;     /* wrap the search */
3142         } while (lgrpid != lgrpid_start);
3143 
3144         /*
3145          * Return root lgroup if the threshold isn't set to its maximum value
3146          * and the lowest lgroup load average is at least that threshold
3147          */
3148         if (lgrp_load_thresh != UINT32_MAX &&
3149             bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3150                 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3151 
3152         /*
3153          * If all the lgroups over which the thread's process is spread are
3154          * heavily loaded, or otherwise undesirable, we'll consider placing
3155          * the thread on one of the other leaf lgroups in the thread's
3156          * partition.
3157          */
3158         if ((bestlpl == NULL) ||
3159             ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3160             (bestrload < bestload) &&        /* paranoid about wraparound */
3161             (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3162             bestload))) {
3163                 bestlpl = bestrlpl;
3164         }
3165 
3166         if (bestlpl == NULL) {
3167                 /*
3168                  * No lgroup looked particularly good, but we still
3169                  * have to pick something. Go with the randomly selected
3170                  * legal lgroup we started with above.
3171                  */
3172                 bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3173         }
3174 
3175         cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3176         bestlpl->lpl_homed_time = gethrtime_unscaled();
3177 
3178         ASSERT(bestlpl->lpl_ncpu > 0);
3179         return (bestlpl);
3180 }
3181 
3182 /*
3183  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3184  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3185  */
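     /*
      * Example (illustrative numbers): with a tolerance of 0x4000
      * (e.g. lgrp_loadavg_tolerance == 0x10000 scaled for a 4-CPU lgroup),
      * a load of 0x20000 beats 0x25000 outright, while 0x24000 vs. 0x25000
      * is within tolerance and wins only under LGRP_CHOOSE_TIME when lpl1
      * was homed to less recently than lpl2.
      */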
3186 static int
3187 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3188 {
3189         lgrp_load_t     l1, l2;
3190         lgrp_load_t     tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3191 
3192         l1 = lpl1->lpl_loadavg;
3193         l2 = lpl2->lpl_loadavg;
3194 
3195         if ((l1 + tolerance < l2) && (l1 < l2)) {
3196                 /* lpl1 is significantly less loaded than lpl2 */
3197                 return (1);
3198         }
3199 
3200         if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3201             l1 + tolerance >= l2 && l1 < l2 &&
3202             lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3203                 /*
3204                  * lpl1's load is within the tolerance of lpl2. We're
3205                  * willing to consider it to be better, however, if
3206                  * it has been longer since we last homed a thread there.
3207                  */
3208                 return (1);
3209         }
3210 
3211         return (0);
3212 }
3213 
3214 /*
3215  * lgrp_trthr_moves counts the number of times the main thread (t_tid == 1) of
3216  * a process that uses text replication has changed its home lgrp. This info is
3217  * used by the segvn asynchronous thread to detect whether it needs to recheck
3218  * which lgrps should be used for text replication.
3219  */
3220 static uint64_t lgrp_trthr_moves = 0;
3221 
3222 uint64_t
3223 lgrp_get_trthr_migrations(void)
3224 {
3225         return (lgrp_trthr_moves);
3226 }
3227 
3228 void
3229 lgrp_update_trthr_migrations(uint64_t incr)
3230 {
3231         atomic_add_64(&lgrp_trthr_moves, incr);
3232 }
3233 
3234 /*
3235  * An LWP is expected to be assigned to an lgroup for at least this long
3236  * for its anticipatory load to be justified.  NOTE that this value should
3237  * not be set extremely large (say, larger than 100 years), to avoid problems
3238  * with overflow in the calculation that uses it.
3239  */
3240 #define LGRP_MIN_NSEC   (NANOSEC / 10)          /* 1/10 of a second */
3241 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3242 
3243 /*
3244  * Routine to change a thread's lgroup affiliation.  This routine updates
3245  * the thread's kthread_t struct and its process' proc_t struct to note the
3246  * thread's new lgroup affiliation, and its lgroup affinities.
3247  *
3248  * Note that this is the only routine that modifies a thread's t_lpl field,
3249  * and that adds in or removes anticipatory load.
3250  *
3251  * If the thread is exiting, newlpl is NULL.
3252  *
3253  * Locking:
3254  * One of the following must be held on entry:
3255  *      cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3256  *              doesn't get removed from t's partition
3257  *
3258  * This routine is not allowed to grab any locks, since it may be called
3259  * with cpus paused (such as from cpu_offline).
3260  */
3261 void
3262 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3263 {
3264         proc_t          *p;
3265         lpl_t           *lpl, *oldlpl;
3266         lgrp_id_t       oldid;
3267         kthread_t       *tp;
3268         uint_t          ncpu;
3269         lgrp_load_t     old, new;
3270 
3271         ASSERT(t);
3272         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3273             THREAD_LOCK_HELD(t));
3274 
3275         /*
3276          * If not changing lpls, just return
3277          */
3278         if ((oldlpl = t->t_lpl) == newlpl)
3279                 return;
3280 
3281         /*
3282          * Make sure the thread's lwp hasn't exited (if so, this thread is now
3283          * associated with process 0 rather than with its original process).
3284          */
3285         if (t->t_proc_flag & TP_LWPEXIT) {
3286                 if (newlpl != NULL) {
3287                         t->t_lpl = newlpl;
3288                 }
3289                 return;
3290         }
3291 
3292         p = ttoproc(t);
3293 
3294         /*
3295          * If the thread had a previous lgroup, update its process' p_lgrpset
3296          * to account for it being moved from its old lgroup.
3297          */
3298         if ((oldlpl != NULL) && /* thread had a previous lgroup */
3299             (p->p_tlist != NULL)) {
3300                 oldid = oldlpl->lpl_lgrpid;
3301 
3302                 if (newlpl != NULL)
3303                         lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3304 
3305                 if ((do_lgrpset_delete) &&
3306                     (klgrpset_ismember(p->p_lgrpset, oldid))) {
3307                         for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3308                                 /*
3309                                  * Check if a thread other than the thread
3310                                  * that's moving is assigned to the same
3311                                  * lgroup as the thread that's moving.  Note
3312                                  * that we have to compare lgroup IDs, rather
3313                                  * than simply comparing t_lpl's, since the
3314                                  * threads may belong to different partitions
3315                                  * but be assigned to the same lgroup.
3316                                  */
3317                                 ASSERT(tp->t_lpl != NULL);
3318 
3319                                 if ((tp != t) &&
3320                                     (tp->t_lpl->lpl_lgrpid == oldid)) {
3321                                         /*
3322                                          * Another thread is assigned to the
3323                                          * same lgroup as the thread that's
3324                                          * moving, p_lgrpset doesn't change.
3325                                          */
3326                                         break;
3327                                 } else if (tp == p->p_tlist) {
3328                                         /*
3329                                          * No other thread is assigned to the
3330                                          * same lgroup as the exiting thread,
3331                                          * clear the lgroup's bit in p_lgrpset.
3332                                          */
3333                                         klgrpset_del(p->p_lgrpset, oldid);
3334                                         break;
3335                                 }
3336                         }
3337                 }
3338 
3339                 /*
3340                  * If this thread was assigned to its old lgroup for such a
3341                  * short amount of time that the anticipatory load that was
3342                  * added on its behalf has aged very little, remove that
3343                  * anticipatory load.
3344                  */
3345                 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3346                     ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3347                         lpl = oldlpl;
3348                         for (;;) {
3349                                 do {
3350                                         old = new = lpl->lpl_loadavg;
3351                                         new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3352                                         if (new > old) {
3353                                                 /*
3354                                                  * this can happen if the load
3355                                                  * average was aged since we
3356                                                  * added in the anticipatory
3357                                                  * load
3358                                                  */
3359                                                 new = 0;
3360                                         }
3361                                 } while (atomic_cas_32(
3362                                     (lgrp_load_t *)&lpl->lpl_loadavg, old,
3363                                     new) != old);
3364 
3365                                 lpl = lpl->lpl_parent;
3366                                 if (lpl == NULL)
3367                                         break;
3368 
3369                                 ncpu = lpl->lpl_ncpu;
3370                                 ASSERT(ncpu > 0);
3371                         }
3372                 }
3373         }
3374         /*
3375          * If the thread has a new lgroup (i.e. it's not exiting), update its
3376          * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3377          * to its new lgroup to account for its move to its new lgroup.
3378          */
3379         if (newlpl != NULL) {
3380                 /*
3381                  * This thread is moving to a new lgroup
3382                  */
3383                 t->t_lpl = newlpl;
3384                 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
3385                         p->p_t1_lgrpid = newlpl->lpl_lgrpid;
3386                         membar_producer();
3387                         if (p->p_tr_lgrpid != LGRP_NONE &&
3388                             p->p_tr_lgrpid != p->p_t1_lgrpid) {
3389                                 lgrp_update_trthr_migrations(1);
3390                         }
3391                 }
3392 
3393                 /*
3394                  * Reflect move in load average of new lgroup
3395                  * unless it is root lgroup
3396                  */
3397                 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3398                         return;
3399 
3400                 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3401                         klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3402                 }
3403 
3404                 /*
3405                  * It'll take some time for the load on the new lgroup
3406                  * to reflect this thread's placement on it.  We'd
3407                  * rather not, however, have all threads between now
3408                  * and then also pile onto this lgroup.  To avoid
3409                  * this pileup, we anticipate the load this thread
3410                  * will generate on its new lgroup.  The goal is to
3411                  * make the lgroup's load appear as though the thread
3412                  * had been there all along.  We're very conservative
3413                  * in calculating this anticipatory load; we assume
3414                  * the worst case (a 100% CPU-bound thread).  This
3415                  * may be modified in the future to be more accurate.
3416                  */
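                     /*
                      * Concretely (topology illustrative): with a 4-CPU leaf
                      * under a 16-CPU root, the loop below adds
                      * LGRP_LOADAVG_MAX_EFFECT(4) to the leaf's load and
                      * LGRP_LOADAVG_MAX_EFFECT(16) to the root's, i.e. the
                      * load one CPU-bound thread would have contributed at
                      * each level had it been there all along.
                      */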
3417                 lpl = newlpl;
3418                 for (;;) {
3419                         ncpu = lpl->lpl_ncpu;
3420                         ASSERT(ncpu > 0);
3421                         do {
3422                                 old = new = lpl->lpl_loadavg;
3423                                 new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3424                                 /*
3425                                  * Check for overflow
3426                                  * Underflow not possible here
3427                                  */
3428                                 if (new < old)
3429                                         new = UINT32_MAX;
3430                         } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
3431                             old, new) != old);
3432 
3433                         lpl = lpl->lpl_parent;
3434                         if (lpl == NULL)
3435                                 break;
3436                 }
3437                 t->t_anttime = gethrtime();
3438         }
3439 }
3440 
3441 /*
3442  * Return lgroup memory allocation policy given advice from madvise(3C)
3443  */
3444 lgrp_mem_policy_t
3445 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3446 {
3447         switch (advice) {
3448         case MADV_ACCESS_LWP:
3449                 return (LGRP_MEM_POLICY_NEXT);
3450         case MADV_ACCESS_MANY:
3451                 return (LGRP_MEM_POLICY_RANDOM);
3452         default:
3453                 return (lgrp_mem_policy_default(size, type));
3454         }
3455 }
3456 
3457 /*
3458  * Figure out default policy
3459  */
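     /*
      * For example, a MAP_SHARED mapping larger than lgrp_shm_random_thresh
      * is spread randomly: across the lgroups of the caller's pset
      * (LGRP_MEM_POLICY_RANDOM_PSET) when lgrp_mem_pset_aware is set and the
      * mapping fits in the pset's memory, otherwise across the whole machine
      * (LGRP_MEM_POLICY_RANDOM).  Smaller mappings fall back to
      * lgrp_mem_default_policy.
      */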
3460 lgrp_mem_policy_t
3461 lgrp_mem_policy_default(size_t size, int type)
3462 {
3463         cpupart_t               *cp;
3464         lgrp_mem_policy_t       policy;
3465         size_t                  pset_mem_size;
3466 
3467         /*
3468          * Randomly allocate memory across lgroups for private or shared
3469          * memory beyond the respective random threshold
3470          */
3471         if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3472             (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3473                 /*
3474                  * Get total memory size of current thread's pset
3475                  */
3476                 kpreempt_disable();
3477                 cp = curthread->t_cpupart;
3478                 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3479                 kpreempt_enable();
3480 
3481                 /*
3482                  * Choose policy to randomly allocate memory across
3483                  * lgroups in pset if it will fit and is not default
3484                  * partition.  Otherwise, allocate memory randomly
3485                  * across machine.
3486                  */
3487                 if (lgrp_mem_pset_aware && size < pset_mem_size)
3488                         policy = LGRP_MEM_POLICY_RANDOM_PSET;
3489                 else
3490                         policy = LGRP_MEM_POLICY_RANDOM;
3491         } else
3492                 /*
3493                  * Apply default policy for private memory and
3494                  * shared memory under the respective random
3495                  * threshold.
3496                  */
3497                 policy = lgrp_mem_default_policy;
3498 
3499         return (policy);
3500 }
3501 
3502 /*
3503  * Get memory allocation policy for this segment
3504  */
3505 lgrp_mem_policy_info_t *
3506 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3507 {
3508         lgrp_mem_policy_info_t  *policy_info;
3509         extern struct seg_ops   segspt_ops;
3510         extern struct seg_ops   segspt_shmops;
3511 
3512         /*
3513          * This is for binary compatibility to protect against third party
3514          * segment drivers which haven't been recompiled to allow for
3515          * SEGOP_GETPOLICY()
3516          */
3517         if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3518             seg->s_ops != &segspt_shmops)
3519                 return (NULL);
3520 
3521         policy_info = NULL;
3522         if (seg->s_ops->getpolicy != NULL)
3523                 policy_info = SEGOP_GETPOLICY(seg, vaddr);
3524 
3525         return (policy_info);
3526 }
3527 
3528 /*
3529  * Set policy for allocating private memory given the desired policy, the
3530  * policy info, and the size in bytes of the memory the policy is applied to.
3531  * Return 0 if the policy wasn't set already and 1 if it was.
3532  */
3533 int
3534 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3535     lgrp_mem_policy_info_t *policy_info, size_t size)
3536 {
3537 
3538         ASSERT(policy_info != NULL);
3539 
3540         if (policy == LGRP_MEM_POLICY_DEFAULT)
3541                 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3542 
3543         /*
3544          * Policy set already?
3545          */
3546         if (policy == policy_info->mem_policy)
3547                 return (1);
3548 
3549         /*
3550          * Set policy
3551          */
3552         policy_info->mem_policy = policy;
3553         policy_info->mem_lgrpid = LGRP_NONE;
3554 
3555         return (0);
3556 }
3557 
3558 
3559 /*
3560  * Get shared memory allocation policy for the given shared object and offset
3561  */
3562 lgrp_mem_policy_info_t *
3563 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3564     u_offset_t vn_off)
3565 {
3566         u_offset_t              off;
3567         lgrp_mem_policy_info_t  *policy_info;
3568         lgrp_shm_policy_seg_t   *policy_seg;
3569         lgrp_shm_locality_t     *shm_locality;
3570         avl_tree_t              *tree;
3571         avl_index_t             where;
3572 
3573         /*
3574          * Get policy segment tree from anon_map or vnode and use specified
3575          * anon index or vnode offset as offset
3576          *
3577          * Assume that no lock needs to be held on anon_map or vnode, since
3578          * they should be protected by their reference count which must be
3579          * nonzero for an existing segment
3580          */
3581         if (amp) {
3582                 ASSERT(amp->refcnt != 0);
3583                 shm_locality = amp->locality;
3584                 if (shm_locality == NULL)
3585                         return (NULL);
3586                 tree = shm_locality->loc_tree;
3587                 off = ptob(anon_index);
3588         } else if (vp) {
3589                 shm_locality = vp->v_locality;
3590                 if (shm_locality == NULL)
3591                         return (NULL);
3592                 ASSERT(shm_locality->loc_count != 0);
3593                 tree = shm_locality->loc_tree;
3594                 off = vn_off;
3595         }
3596 
3597         if (tree == NULL)
3598                 return (NULL);
3599 
3600         /*
3601          * Lookup policy segment for offset into shared object and return
3602          * policy info
3603          */
3604         rw_enter(&shm_locality->loc_lock, RW_READER);
3605         policy_info = NULL;
3606         policy_seg = avl_find(tree, &off, &where);
3607         if (policy_seg)
3608                 policy_info = &policy_seg->shm_policy;
3609         rw_exit(&shm_locality->loc_lock);
3610 
3611         return (policy_info);
3612 }
3613 
3614 /*
3615  * Default memory allocation policy for kernel segmap pages
3616  */
3617 lgrp_mem_policy_t       lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3618 
3619 /*
3620  * Return lgroup to use for allocating memory
3621  * given the segment and address
3622  *
3623  * There isn't any mutual exclusion between calls
3624  * to this routine and DR, so this routine and whoever calls it
3625  * should be mindful of the possibility that the lgrp returned
3626  * may be deleted. If this happens, dereferences of the lgrp
3627  * pointer will still be safe, but the resources in the lgrp will
3628  * be gone, and LGRP_EXISTS() will no longer be true.
3629  */
3630 lgrp_t *
3631 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3632 {
3633         int                     i;
3634         lgrp_t                  *lgrp;
3635         klgrpset_t              lgrpset;
3636         int                     lgrps_spanned;
3637         unsigned long           off;
3638         lgrp_mem_policy_t       policy;
3639         lgrp_mem_policy_info_t  *policy_info;
3640         ushort_t                random;
3641         int                     stat = 0;
3642         extern struct seg       *segkmap;
3643 
3644         /*
3645          * Just return the root lgroup if the lgrp framework hasn't finished
3646          * initializing or if this is a UMA machine.
3647          */
3648         if (nlgrps == 1 || !lgrp_initialized)
3649                 return (lgrp_root);
3650 
3651         /*
3652          * Get memory allocation policy for this segment
3653          */
3654         policy = lgrp_mem_default_policy;
3655         if (seg != NULL) {
3656                 if (seg->s_as == &kas) {
3657                         if (seg == segkmap)
3658                                 policy = lgrp_segmap_default_policy;
3659                         if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3660                             policy == LGRP_MEM_POLICY_RANDOM_PSET)
3661                                 policy = LGRP_MEM_POLICY_RANDOM;
3662                 } else {
3663                         policy_info = lgrp_mem_policy_get(seg, vaddr);
3664                         if (policy_info != NULL) {
3665                                 policy = policy_info->mem_policy;
3666                                 if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3667                                         lgrp_id_t id = policy_info->mem_lgrpid;
3668                                         ASSERT(id != LGRP_NONE);
3669                                         ASSERT(id < NLGRPS_MAX);
3670                                         lgrp = lgrp_table[id];
3671                                         if (!LGRP_EXISTS(lgrp)) {
3672                                                 policy = LGRP_MEM_POLICY_NEXT;
3673                                         } else {
3674                                                 lgrp_stat_add(id,
3675                                                     LGRP_NUM_NEXT_SEG, 1);
3676                                                 return (lgrp);
3677                                         }
3678                                 }
3679                         }
3680                 }
3681         }
3682         lgrpset = 0;
3683 
3684         /*
3685          * Initialize lgroup to home by default
3686          */
3687         lgrp = lgrp_home_lgrp();
3688 
3689         /*
3690          * When homing threads on root lgrp, override default memory
3691          * allocation policies with root lgroup memory allocation policy
3692          */
3693         if (lgrp == lgrp_root)
3694                 policy = lgrp_mem_policy_root;
3695 
3696         /*
3697          * Implement policy
3698          */
3699         switch (policy) {
3700         case LGRP_MEM_POLICY_NEXT_CPU:
3701 
3702                 /*
3703                  * Return lgroup of current CPU which faulted on memory.
3704                  * If the CPU isn't currently in an lgrp, then opt to
3705                  * allocate from the root.
3706                  *
3707                  * Kernel preemption needs to be disabled here to prevent
3708                  * the current CPU from going away before lgrp is found.
3709                  */
3710                 if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3711                         lgrp = lgrp_root;
3712                 } else {
3713                         kpreempt_disable();
3714                         lgrp = lgrp_cpu_to_lgrp(CPU);
3715                         kpreempt_enable();
3716                 }
3717                 break;
3718 
3719         case LGRP_MEM_POLICY_NEXT:
3720         case LGRP_MEM_POLICY_DEFAULT:
3721         default:
3722 
3723                 /*
3724                  * Just return current thread's home lgroup
3725                  * for the default policy (next touch).
3726                  * If the thread is homed to the root,
3727                  * then the default policy is random across lgroups.
3728                  * Fallthrough to the random case.
3729                  */
3730                 if (lgrp != lgrp_root) {
3731                         if (policy == LGRP_MEM_POLICY_NEXT)
3732                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3733                         else
3734                                 lgrp_stat_add(lgrp->lgrp_id,
3735                                     LGRP_NUM_DEFAULT, 1);
3736                         break;
3737                 }
3738                 /* FALLTHROUGH */
3739         case LGRP_MEM_POLICY_RANDOM:
3740 
3741                 /*
3742                  * Return a random leaf lgroup with memory
3743                  */
3744                 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3745                 /*
3746                  * Count how many lgroups are spanned
3747                  */
3748                 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3749 
3750                 /*
3751                  * There may be no memnodes in the root lgroup during DR copy
3752                  * rename on a system with only two boards (memnodes)
3753                  * configured. In this case just return the root lgrp.
3754                  */
3755                 if (lgrps_spanned == 0) {
3756                         lgrp = lgrp_root;
3757                         break;
3758                 }
3759 
3760                 /*
3761                  * Pick a random offset within lgroups spanned
3762                  * and return lgroup at that offset
3763                  */
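                     /*
                      * e.g. if lgroups 1, 3 and 5 have memory and the random
                      * offset is 1, the walk below skips lgroup 1 and settles
                      * on lgroup 3.
                      */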
3764                 random = (ushort_t)gethrtime() >> 4;
3765                 off = random % lgrps_spanned;
3766                 ASSERT(off <= lgrp_alloc_max);
3767 
3768                 for (i = 0; i <= lgrp_alloc_max; i++) {
3769                         if (!klgrpset_ismember(lgrpset, i))
3770                                 continue;
3771                         if (off)
3772                                 off--;
3773                         else {
3774                                 lgrp = lgrp_table[i];
3775                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3776                                     1);
3777                                 break;
3778                         }
3779                 }
3780                 break;
3781 
3782         case LGRP_MEM_POLICY_RANDOM_PROC:
3783 
3784                 /*
3785                  * Grab copy of bitmask of lgroups spanned by
3786                  * this process
3787                  */
3788                 klgrpset_copy(lgrpset, curproc->p_lgrpset);
3789                 stat = LGRP_NUM_RANDOM_PROC;
3790 
3791                 /* FALLTHROUGH */
3792         case LGRP_MEM_POLICY_RANDOM_PSET:
3793 
3794                 if (!stat)
3795                         stat = LGRP_NUM_RANDOM_PSET;
3796 
3797                 if (klgrpset_isempty(lgrpset)) {
3798                         /*
3799                          * Grab copy of bitmask of lgroups spanned by
3800                          * this processor set
3801                          */
3802                         kpreempt_disable();
3803                         klgrpset_copy(lgrpset,
3804                             curthread->t_cpupart->cp_lgrpset);
3805                         kpreempt_enable();
3806                 }
3807 
3808                 /*
3809                  * Count how many lgroups are spanned
3810                  */
3811                 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3812                 ASSERT(lgrps_spanned <= nlgrps);
3813 
3814                 /*
3815                  * lgrps_spanned should probably always be non-zero, but to be
3816                  * on the safe side we return lgrp_root if the set is empty.
3817                  */
3818                 if (lgrps_spanned == 0) {
3819                         lgrp = lgrp_root;
3820                         break;
3821                 }
3822 
3823                 /*
3824                  * Pick a random offset within lgroups spanned
3825                  * and return lgroup at that offset
3826                  */
3827                 random = (ushort_t)gethrtime() >> 4;
3828                 off = random % lgrps_spanned;
3829                 ASSERT(off <= lgrp_alloc_max);
3830 
3831                 for (i = 0; i <= lgrp_alloc_max; i++) {
3832                         if (!klgrpset_ismember(lgrpset, i))
3833                                 continue;
3834                         if (off)
3835                                 off--;
3836                         else {
3837                                 lgrp = lgrp_table[i];
3838                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3839                                     1);
3840                                 break;
3841                         }
3842                 }
3843                 break;
3844 
3845         case LGRP_MEM_POLICY_ROUNDROBIN:
3846 
3847                 /*
3848                  * Use the offset within the segment to determine
3849                  * the offset from the home lgroup of the next
3850                  * lgroup to allocate memory from
3851                  */
3852                 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3853                     (lgrp_alloc_max + 1);
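                     /*
                      * e.g. with lgrp_alloc_max == 3 and a fault 5 pages into
                      * the segment, off == 5 % 4 == 1, so the loop below
                      * settles on the first lgroup with memory after the home
                      * lgroup.
                      */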
3854 
3855                 kpreempt_disable();
3856                 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3857                 i = lgrp->lgrp_id;
3858                 kpreempt_enable();
3859 
3860                 while (off > 0) {
3861                         i = (i + 1) % (lgrp_alloc_max + 1);
3862                         lgrp = lgrp_table[i];
3863                         if (klgrpset_ismember(lgrpset, i))
3864                                 off--;
3865                 }
3866                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3867 
3868                 break;
3869         }
3870 
3871         ASSERT(lgrp != NULL);
3872         return (lgrp);
3873 }
3874 
3875 /*
3876  * Return the number of pages in an lgroup
3877  *
3878  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3879  *       could cause tests that rely on the numat driver to fail....
3880  */
3881 pgcnt_t
3882 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3883 {
3884         lgrp_t *lgrp;
3885 
3886         lgrp = lgrp_table[lgrpid];
3887         if (!LGRP_EXISTS(lgrp) ||
3888             klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3889             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3890                 return (0);
3891 
3892         return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3893 }
3894 
3895 /*
3896  * Initialize lgroup shared memory allocation policy support
3897  */
3898 void
3899 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3900 {
3901         lgrp_shm_locality_t     *shm_locality;
3902 
3903         /*
3904          * Initialize the locality field in the anon_map.
3905          * No locks are needed because this is called when the anon_map is
3906          * allocated, but not yet used anywhere.
3907          */
3908         if (amp) {
3909                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3910                 if (amp->locality == NULL) {
3911                         /*
3912                          * Allocate and initialize shared memory locality info
3913                          * and set anon_map locality pointer to it
3914                          * Drop lock across kmem_alloc(KM_SLEEP)
3915                          */
3916                         ANON_LOCK_EXIT(&amp->a_rwlock);
3917                         shm_locality = kmem_alloc(sizeof (*shm_locality),
3918                             KM_SLEEP);
3919                         rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3920                             NULL);
3921                         shm_locality->loc_count = 1; /* not used for amp */
3922                         shm_locality->loc_tree = NULL;
3923 
3924                         /*
3925                          * Reacquire lock and check to see whether anyone beat
3926                          * us to initializing the locality info
3927                          */
3928                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3929                         if (amp->locality != NULL) {
3930                                 rw_destroy(&shm_locality->loc_lock);
3931                                 kmem_free(shm_locality,
3932                                     sizeof (*shm_locality));
3933                         } else
3934                                 amp->locality = shm_locality;
3935                 }
3936                 ANON_LOCK_EXIT(&amp->a_rwlock);
3937                 return;
3938         }
3939 
3940         /*
3941          * Allocate shared vnode policy info if vnode is not locality aware yet
3942          */
3943         mutex_enter(&vp->v_lock);
3944         if ((vp->v_flag & V_LOCALITY) == 0) {
3945                 /*
3946                  * Allocate and initialize shared memory locality info
3947                  */
3948                 mutex_exit(&vp->v_lock);
3949                 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3950                 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3951                 shm_locality->loc_count = 1;
3952                 shm_locality->loc_tree = NULL;
3953 
3954                 /*
3955                  * Point vnode locality field at shared vnode policy info
3956                  * and set locality aware flag in vnode
3957                  */
3958                 mutex_enter(&vp->v_lock);
3959                 if ((vp->v_flag & V_LOCALITY) == 0) {
3960                         vp->v_locality = shm_locality;
3961                         vp->v_flag |= V_LOCALITY;
3962                 } else {
3963                         /*
3964                          * Lost race so free locality info and increment count.
3965                          */
3966                         rw_destroy(&shm_locality->loc_lock);
3967                         kmem_free(shm_locality, sizeof (*shm_locality));
3968                         shm_locality = vp->v_locality;
3969                         shm_locality->loc_count++;
3970                 }
3971                 mutex_exit(&vp->v_lock);
3972 
3973                 return;
3974         }
3975 
3976         /*
3977          * Increment reference count of number of segments mapping this vnode
3978          * shared
3979          */
3980         shm_locality = vp->v_locality;
3981         shm_locality->loc_count++;
3982         mutex_exit(&vp->v_lock);
3983 }
3984 
3985 /*
3986  * Destroy the given shared memory policy segment tree
3987  */
3988 void
3989 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3990 {
3991         lgrp_shm_policy_seg_t   *cur;
3992         lgrp_shm_policy_seg_t   *next;
3993 
3994         if (tree == NULL)
3995                 return;
3996 
3997         cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3998         while (cur != NULL) {
3999                 next = AVL_NEXT(tree, cur);
4000                 avl_remove(tree, cur);
4001                 kmem_free(cur, sizeof (*cur));
4002                 cur = next;
4003         }
4004         kmem_free(tree, sizeof (avl_tree_t));
4005 }
4006 
4007 /*
4008  * Uninitialize lgroup shared memory allocation policy support
4009  */
4010 void
4011 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4012 {
4013         lgrp_shm_locality_t     *shm_locality;
4014 
4015         /*
4016          * For an anon_map, deallocate the shared memory policy tree and
4017          * zero the locality field.
4018          * No locks are needed because the anon_map is being freed.
4019          */
4020         if (amp) {
4021                 if (amp->locality == NULL)
4022                         return;
4023                 shm_locality = amp->locality;
4024                 shm_locality->loc_count = 0; /* not really used for amp */
4025                 rw_destroy(&shm_locality->loc_lock);
4026                 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4027                 kmem_free(shm_locality, sizeof (*shm_locality));
4028                 amp->locality = 0;
4029                 return;
4030         }
4031 
4032         /*
4033          * For vnode, decrement reference count of segments mapping this vnode
4034          * shared and delete locality info if reference count drops to 0
4035          */
4036         mutex_enter(&vp->v_lock);
4037         shm_locality = vp->v_locality;
4038         shm_locality->loc_count--;
4039 
4040         if (shm_locality->loc_count == 0) {
4041                 rw_destroy(&shm_locality->loc_lock);
4042                 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4043                 kmem_free(shm_locality, sizeof (*shm_locality));
4044                 vp->v_locality = 0;
4045                 vp->v_flag &= ~V_LOCALITY;
4046         }
4047         mutex_exit(&vp->v_lock);
4048 }
4049 
4050 /*
4051  * Compare two shared memory policy segments
4052  * Used by AVL tree code for searching
4053  */
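     /*
      * The comparator below treats a segment covering
      * [shm_off, shm_off + shm_size) as equal to any offset that falls
      * inside it.  Lookups elsewhere in this file, such as
      * avl_find(tree, &off, &where), pass a bare offset as the search key;
      * only the key's shm_off is consulted, so the offset locates the
      * policy segment that covers it.
      */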
4054 int
4055 lgrp_shm_policy_compar(const void *x, const void *y)
4056 {
4057         lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4058         lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4059 
4060         if (a->shm_off < b->shm_off)
4061                 return (-1);
4062         if (a->shm_off >= b->shm_off + b->shm_size)
4063                 return (1);
4064         return (0);
4065 }
4066 
4067 /*
4068  * Concatenate seg1 with seg2 and remove seg2
4069  */
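     /*
      * e.g. adjacent segments covering [0x0, 0x3000) and [0x3000, 0x5000)
      * with the same policy collapse into one segment covering
      * [0x0, 0x5000); if the segments aren't adjacent or their policies
      * differ, -1 is returned and nothing changes.
      */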
4070 static int
4071 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4072     lgrp_shm_policy_seg_t *seg2)
4073 {
4074         if (!seg1 || !seg2 ||
4075             seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4076             seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4077                 return (-1);
4078 
4079         seg1->shm_size += seg2->shm_size;
4080         avl_remove(tree, seg2);
4081         kmem_free(seg2, sizeof (*seg2));
4082         return (0);
4083 }
4084 
4085 /*
4086  * Split segment at given offset and return rightmost (uppermost) segment
4087  * Assumes that there are no overlapping segments
4088  */
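     /*
      * e.g. splitting a segment covering [0x0, 0x8000) at offset 0x3000
      * shrinks it to [0x0, 0x3000) and inserts a new [0x3000, 0x8000)
      * segment carrying the same policy; splitting exactly at either end
      * simply returns the original segment.
      */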
4089 static lgrp_shm_policy_seg_t *
4090 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4091     u_offset_t off)
4092 {
4093         lgrp_shm_policy_seg_t   *newseg;
4094         avl_index_t             where;
4095 
4096         ASSERT(seg != NULL);
4097         ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4098 
4099         if (!seg || off < seg->shm_off || off > seg->shm_off +
4100             seg->shm_size)
4101                 return (NULL);
4102 
4103         if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4104                 return (seg);
4105 
4106         /*
4107          * Adjust size of left segment and allocate new (right) segment
4108          */
4109         newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4110         newseg->shm_policy = seg->shm_policy;
4111         newseg->shm_off = off;
4112         newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4113         seg->shm_size = off - seg->shm_off;
4114 
4115         /*
4116          * Find where to insert new segment in AVL tree and insert it
4117          */
4118         (void) avl_find(tree, &off, &where);
4119         avl_insert(tree, newseg, where);
4120 
4121         return (newseg);
4122 }
4123 
4124 /*
4125  * Set shared memory allocation policy on specified shared object at given
4126  * offset and length
4127  *
4128  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4129  * -1 if the policy can't be set.
4130  */
4131 int
4132 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4133     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4134 {
4135         u_offset_t              eoff;
4136         lgrp_shm_policy_seg_t   *next;
4137         lgrp_shm_policy_seg_t   *newseg;
4138         u_offset_t              off;
4139         u_offset_t              oldeoff;
4140         lgrp_shm_policy_seg_t   *prev;
4141         int                     retval;
4142         lgrp_shm_policy_seg_t   *seg;
4143         lgrp_shm_locality_t     *shm_locality;
4144         avl_tree_t              *tree;
4145         avl_index_t             where;
4146 
4147         ASSERT(amp || vp);
4148         ASSERT((len & PAGEOFFSET) == 0);
4149 
4150         if (len == 0)
4151                 return (-1);
4152 
4153         retval = 0;
4154 
4155         /*
4156          * Get locality info and starting offset into shared object
4157          * Try anon map first and then vnode
4158          * Assume that no locks need to be held on anon_map or vnode, since
4159          * it should be protected by its reference count which must be nonzero
4160          * for an existing segment.
4161          */
4162         if (amp) {
4163                 /*
4164                  * Get policy info from anon_map
4165                  *
4166                  */
4167                 ASSERT(amp->refcnt != 0);
4168                 if (amp->locality == NULL)
4169                         lgrp_shm_policy_init(amp, NULL);
4170                 shm_locality = amp->locality;
4171                 off = ptob(anon_index);
4172         } else if (vp) {
4173                 /*
4174                  * Get policy info from vnode
4175                  */
4176                 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4177                         lgrp_shm_policy_init(NULL, vp);
4178                 shm_locality = vp->v_locality;
4179                 ASSERT(shm_locality->loc_count != 0);
4180                 off = vn_off;
4181         } else
4182                 return (-1);
4183 
4184         ASSERT((off & PAGEOFFSET) == 0);
4185 
4186         /*
4187          * Figure out default policy
4188          */
4189         if (policy == LGRP_MEM_POLICY_DEFAULT)
4190                 policy = lgrp_mem_policy_default(len, MAP_SHARED);
4191 
4192         /*
4193          * Create AVL tree if there isn't one yet
4194          * and set locality field to point at it
4195          */
4196         rw_enter(&shm_locality->loc_lock, RW_WRITER);
4197         tree = shm_locality->loc_tree;
4198         if (!tree) {
4199                 rw_exit(&shm_locality->loc_lock);
4200 
4201                 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4202 
4203                 rw_enter(&shm_locality->loc_lock, RW_WRITER);
4204                 if (shm_locality->loc_tree == NULL) {
4205                         avl_create(tree, lgrp_shm_policy_compar,
4206                             sizeof (lgrp_shm_policy_seg_t),
4207                             offsetof(lgrp_shm_policy_seg_t, shm_tree));
4208                         shm_locality->loc_tree = tree;
4209                 } else {
4210                         /*
4211                          * Another thread managed to set up the tree
4212                          * before we could. Free the tree we allocated
4213                          * and use the one that's already there.
4214                          */
4215                         kmem_free(tree, sizeof (*tree));
4216                         tree = shm_locality->loc_tree;
4217                 }
4218         }
4219 
4220         /*
4221          * Set policy
4222          *
4223          * Need to maintain hold on writer's lock to keep tree from
4224          * changing out from under us
4225          */
4226         while (len != 0) {
4227                 /*
4228                  * Find policy segment for specified offset into shared object
4229                  */
4230                 seg = avl_find(tree, &off, &where);
4231 
4232                 /*
4233                  * Didn't find any existing segment that contains specified
4234                  * offset, so allocate new segment, insert it, and concatenate
4235                  * with adjacent segments if possible
4236                  */
4237                 if (seg == NULL) {
4238                         newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4239                             KM_SLEEP);
4240                         newseg->shm_policy.mem_policy = policy;
4241                         newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4242                         newseg->shm_off = off;
4243                         avl_insert(tree, newseg, where);
4244 
4245                         /*
4246                          * Check to see whether new segment overlaps with next
4247                          * one, set length of new segment accordingly, and
4248                          * calculate remaining length and next offset
4249                          */
4250                         seg = AVL_NEXT(tree, newseg);
4251                         if (seg == NULL || off + len <= seg->shm_off) {
4252                                 newseg->shm_size = len;
4253                                 len = 0;
4254                         } else {
4255                                 newseg->shm_size = seg->shm_off - off;
4256                                 off = seg->shm_off;
4257                                 len -= newseg->shm_size;
4258                         }
4259 
4260                         /*
4261                          * Try to concatenate new segment with next and
4262                          * previous ones, since they might have the same policy
4263                          * now.  Grab previous and next segments first because
4264                          * they will change on concatenation.
4265                          */
4266                         prev =  AVL_PREV(tree, newseg);
4267                         next = AVL_NEXT(tree, newseg);
4268                         (void) lgrp_shm_policy_concat(tree, newseg, next);
4269                         (void) lgrp_shm_policy_concat(tree, prev, newseg);
4270 
4271                         continue;
4272                 }
4273 
4274                 eoff = off + len;
4275                 oldeoff = seg->shm_off + seg->shm_size;
4276 
4277                 /*
4278                  * Policy set already?
4279                  */
4280                 if (policy == seg->shm_policy.mem_policy) {
4281                         /*
4282                          * Nothing left to do if offset and length
4283                          * fall within this segment
4284                          */
4285                         if (eoff <= oldeoff) {
4286                                 retval = 1;
4287                                 break;
4288                         } else {
4289                                 len = eoff - oldeoff;
4290                                 off = oldeoff;
4291                                 continue;
4292                         }
4293                 }
4294 
4295                 /*
4296                  * Specified offset and length match existing segment exactly
4297                  */
4298                 if (off == seg->shm_off && len == seg->shm_size) {
4299                         /*
4300                          * Set policy and update current length
4301                          */
4302                         seg->shm_policy.mem_policy = policy;
4303                         seg->shm_policy.mem_lgrpid = LGRP_NONE;
4304                         len = 0;
4305 
4306                         /*
4307                          * Try concatenating new segment with previous and next
4308                          * segments, since they might have the same policy now.
4309                          * Grab previous and next segments first because they
4310                          * will change on concatenation.
4311                          */
4312                         prev =  AVL_PREV(tree, seg);
4313                         next = AVL_NEXT(tree, seg);
4314                         (void) lgrp_shm_policy_concat(tree, seg, next);
4315                         (void) lgrp_shm_policy_concat(tree, prev, seg);
4316                 } else {
4317                         /*
4318                          * Specified offset and length only apply to part of
4319                          * existing segment
4320                          */
4321 
4322                         /*
4323                          * New range starts in the middle of the old segment,
4324                          * so split the old one at the new starting offset
4325                          */
4326                         newseg = NULL;
4327                         if (off > seg->shm_off) {
4328                                 newseg = lgrp_shm_policy_split(tree, seg, off);
4329 
4330                                 /*
4331                                  * New segment ends where old one did, so try
4332                                  * to concatenate with next segment
4333                                  */
4334                                 if (eoff == oldeoff) {
4335                                         newseg->shm_policy.mem_policy = policy;
4336                                         newseg->shm_policy.mem_lgrpid =
4337                                             LGRP_NONE;
4338                                         (void) lgrp_shm_policy_concat(tree,
4339                                             newseg, AVL_NEXT(tree, newseg));
4340                                         break;
4341                                 }
4342                         }
4343 
4344                         /*
4345                          * New segment ends before old one, so split off end of
4346                          * old one
4347                          */
4348                         if (eoff < oldeoff) {
4349                                 if (newseg) {
4350                                         (void) lgrp_shm_policy_split(tree,
4351                                             newseg, eoff);
4352                                         newseg->shm_policy.mem_policy = policy;
4353                                         newseg->shm_policy.mem_lgrpid =
4354                                             LGRP_NONE;
4355                                 } else {
4356                                         (void) lgrp_shm_policy_split(tree, seg,
4357                                             eoff);
4358                                         seg->shm_policy.mem_policy = policy;
4359                                         seg->shm_policy.mem_lgrpid = LGRP_NONE;
4360                                 }
4361 
4362                                 if (off == seg->shm_off)
4363                                         (void) lgrp_shm_policy_concat(tree,
4364                                             AVL_PREV(tree, seg), seg);
4365                                 break;
4366                         }
4367 
4368                         /*
4369                          * Calculate remaining length and next offset
4370                          */
4371                         len = eoff - oldeoff;
4372                         off = oldeoff;
4373                 }
4374         }
4375 
4376         rw_exit(&shm_locality->loc_lock);
4377         return (retval);
4378 }
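
#if 0
/*
 * Illustrative sketch, not part of the original code: how a hypothetical
 * caller might apply a next-touch policy to a page-aligned range of a
 * shared anon segment.  LGRP_MEM_POLICY_NEXT is just an example policy;
 * amp, anon_index, and len would come from the caller's segment.
 */
static int
lgrp_shm_policy_set_example(struct anon_map *amp, ulong_t anon_index,
    size_t len)
{
	int	ret;

	/* len must be a multiple of PAGESIZE and amp must be referenced */
	ret = lgrp_shm_policy_set(LGRP_MEM_POLICY_NEXT, amp, anon_index,
	    NULL, (u_offset_t)0, len);

	/*
	 * ret == 0 means the policy was newly set, ret == 1 means the
	 * range already had this policy, and ret == -1 means failure.
	 */
	return (ret);
}
#endif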
4379 
4380 /*
4381  * Return the best memnode from which to allocate memory given
4382  * an lgroup.
4383  *
4384  * "c" is for cookie, which is good enough for me.
4385  * It references a cookie struct that should be zeroed to initialize.
4386  * The cookie should live on the caller's stack.
4387  *
4388  * The routine returns -1 when:
4389  *      - the scope is LGRP_SRCH_LOCAL and "lgrp" has no memnodes left.
4390  *      - the scope spans the hierarchy and all the memnodes in the
4391  *        system have been returned.
4392  */
4393 int
4394 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4395 {
4396         lgrp_t          *lp = c->lmc_lgrp;
4397         mnodeset_t      nodes = c->lmc_nodes;
4398         int             cnt = c->lmc_cnt;
4399         int             offset, mnode;
4400 
4401         extern int      max_mem_nodes;
4402 
4403         /*
4404          * If the set is empty, and the caller is willing, traverse
4405          * up the hierarchy until we find a non-empty set.
4406          */
4407         while (nodes == (mnodeset_t)0 || cnt <= 0) {
4408                 if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4409                     ((lp = lp->lgrp_parent) == NULL))
4410                         return (-1);
4411 
4412                 nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4413                 cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4414         }
4415 
4416         /*
4417          * Select a memnode by picking one at a "random" offset.
4418          * Because of DR, memnodes can come and go at any time.
4419          * This code must be able to cope with the possibility
4420          * that the nodes count "cnt" is inconsistent with respect
4421          * to the number of elements actually in "nodes", and
4422          * therefore that the offset chosen could be greater than
4423          * the number of elements in the set (some memnodes may
4424          * have disappeared just before cnt was read).
4425          * If this happens, the search simply wraps back to the
4426          * beginning of the set.
4427          */
4428         ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4429         offset = c->lmc_rand % cnt;
4430         do {
4431                 for (mnode = 0; mnode < max_mem_nodes; mnode++)
4432                         if (nodes & ((mnodeset_t)1 << mnode))
4433                                 if (!offset--)
4434                                         break;
4435         } while (mnode >= max_mem_nodes);
4436 
4437         /* Found a node. Store state before returning. */
4438         c->lmc_lgrp = lp;
4439         c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4440         c->lmc_cnt = cnt - 1;
4441         c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4442         c->lmc_ntried++;
4443 
4444         return (mnode);
4445 }
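
#if 0
/*
 * Illustrative sketch, not part of the original code: visiting every
 * memnode reachable from a starting lgroup.  Assumes the
 * LGRP_MNODE_COOKIE_INIT() macro and LGRP_SRCH_HIER scope from
 * <sys/lgrp.h>; "lgrp" is a hypothetical starting lgroup.
 */
static void
lgrp_memnode_walk_example(lgrp_t *lgrp)
{
	lgrp_mnode_cookie_t	c;
	int			mnode;

	/* Zero and prime the cookie on the caller's stack */
	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);

	/* Each call returns a different memnode until -1 says we are done */
	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
		/* ... try allocating from "mnode" here ... */
		continue;
	}
}
#endif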