/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * its children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
68 */ 69 70 #include <sys/lgrp.h> 71 #include <sys/lgrp_user.h> 72 #include <sys/types.h> 73 #include <sys/mman.h> 74 #include <sys/param.h> 75 #include <sys/var.h> 76 #include <sys/thread.h> 77 #include <sys/cpuvar.h> 78 #include <sys/cpupart.h> 79 #include <sys/kmem.h> 80 #include <vm/seg.h> 81 #include <vm/seg_kmem.h> 82 #include <vm/seg_spt.h> 83 #include <vm/seg_vn.h> 84 #include <vm/as.h> 85 #include <sys/atomic.h> 86 #include <sys/systm.h> 87 #include <sys/errno.h> 88 #include <sys/cmn_err.h> 89 #include <sys/kstat.h> 90 #include <sys/sysmacros.h> 91 #include <sys/pg.h> 92 #include <sys/promif.h> 93 #include <sys/sdt.h> 94 #include <sys/ht.h> 95 96 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 98 /* indexed by lgrp_id */ 99 int nlgrps; /* number of lgroups in machine */ 100 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 101 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 102 103 /* 104 * Kstat data for lgroups. 105 * 106 * Actual kstat data is collected in lgrp_stats array. 107 * The lgrp_kstat_data array of named kstats is used to extract data from 108 * lgrp_stats and present it to kstat framework. It is protected from partallel 109 * modifications by lgrp_kstat_mutex. This may cause some contention when 110 * several kstat commands run in parallel but this is not the 111 * performance-critical path. 112 */ 113 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 114 115 /* 116 * Declare kstat names statically for enums as defined in the header file. 
117 */ 118 LGRP_KSTAT_NAMES; 119 120 static void lgrp_kstat_init(void); 121 static int lgrp_kstat_extract(kstat_t *, int); 122 static void lgrp_kstat_reset(lgrp_id_t); 123 124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 125 static kmutex_t lgrp_kstat_mutex; 126 127 128 /* 129 * max number of lgroups supported by the platform 130 */ 131 int nlgrpsmax = 0; 132 133 /* 134 * The root lgroup. Represents the set of resources at the system wide 135 * level of locality. 136 */ 137 lgrp_t *lgrp_root = NULL; 138 139 /* 140 * During system bootstrap cp_default does not contain the list of lgrp load 141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 142 * on-line when cp_default is initialized by cpupart_initialize_default(). 143 * Configuring CPU0 may create a two-level topology with root and one leaf node 144 * containing CPU0. This topology is initially constructed in a special 145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 147 * for all lpl operations until cp_default is fully constructed. 148 * 149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to 151 * the first element of lpl_bootstrap_list. 152 * 153 * CPUs that are added to the system, but have not yet been assigned to an 154 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because 155 * on some architectures (x86) it's possible for the slave CPU startup thread 156 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init(). 
157 */ 158 #define LPL_BOOTSTRAP_SIZE 2 159 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 160 lpl_t *lpl_bootstrap; 161 static lpl_t *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE]; 162 static int lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE]; 163 164 /* 165 * If cp still references the bootstrap lpl, it has not yet been added to 166 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where 167 * a thread is trying to allocate memory close to a CPU that has no lgrp. 168 */ 169 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap) 170 171 static lgrp_t lroot; 172 173 /* 174 * Size, in bytes, beyond which random memory allocation policy is applied 175 * to non-shared memory. Default is the maximum size, so random memory 176 * allocation won't be used for non-shared memory by default. 177 */ 178 size_t lgrp_privm_random_thresh = (size_t)(-1); 179 180 /* the maximum effect that a single thread can have on it's lgroup's load */ 181 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 182 ((lgrp_loadavg_max_effect) / (ncpu)) 183 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 184 185 186 /* 187 * Size, in bytes, beyond which random memory allocation policy is applied to 188 * shared memory. Default is 8MB (2 ISM pages). 189 */ 190 size_t lgrp_shm_random_thresh = 8*1024*1024; 191 192 /* 193 * Whether to do processor set aware memory allocation by default 194 */ 195 int lgrp_mem_pset_aware = 0; 196 197 /* 198 * Set the default memory allocation policy for root lgroup 199 */ 200 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 201 202 /* 203 * Set the default memory allocation policy. For most platforms, 204 * next touch is sufficient, but some platforms may wish to override 205 * this. 
206 */ 207 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 208 209 210 /* 211 * lgroup CPU event handlers 212 */ 213 static void lgrp_cpu_init(struct cpu *); 214 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 215 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 216 217 /* 218 * lgroup memory event handlers 219 */ 220 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 221 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 222 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 223 224 /* 225 * lgroup CPU partition event handlers 226 */ 227 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 228 static void lgrp_part_del_cpu(struct cpu *); 229 230 /* 231 * lgroup framework initialization 232 */ 233 static void lgrp_main_init(void); 234 static void lgrp_main_mp_init(void); 235 static void lgrp_root_init(void); 236 static void lgrp_setup(void); 237 238 /* 239 * lpl topology 240 */ 241 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 242 static void lpl_clear(lpl_t *); 243 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 244 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 245 static void lpl_rset_add(lpl_t *, lpl_t *); 246 static void lpl_rset_del(lpl_t *, lpl_t *); 247 static int lpl_rset_contains(lpl_t *, lpl_t *); 248 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 249 static void lpl_child_update(lpl_t *, struct cpupart *); 250 static int lpl_pick(lpl_t *, lpl_t *); 251 static void lpl_verify_wrapper(struct cpupart *); 252 253 /* 254 * defines for lpl topology verifier return codes 255 */ 256 257 #define LPL_TOPO_CORRECT 0 258 #define LPL_TOPO_PART_HAS_NO_LPL -1 259 #define LPL_TOPO_CPUS_NOT_EMPTY -2 260 #define LPL_TOPO_LGRP_MISMATCH -3 261 #define LPL_TOPO_MISSING_PARENT -4 262 #define LPL_TOPO_PARENT_MISMATCH -5 263 #define LPL_TOPO_BAD_CPUCNT -6 264 #define LPL_TOPO_RSET_MISMATCH -7 265 #define LPL_TOPO_LPL_ORPHANED -8 266 #define LPL_TOPO_LPL_BAD_NCPU -9 267 #define LPL_TOPO_RSET_MSSNG_LF 
-10 268 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 269 #define LPL_TOPO_NONLEAF_HAS_CPUS -12 270 #define LPL_TOPO_LGRP_NOT_LEAF -13 271 #define LPL_TOPO_BAD_RSETCNT -14 272 273 /* 274 * Return whether lgroup optimizations should be enabled on this system 275 */ 276 int 277 lgrp_optimizations(void) 278 { 279 /* 280 * System must have more than 2 lgroups to enable lgroup optimizations 281 * 282 * XXX This assumes that a 2 lgroup system has an empty root lgroup 283 * with one child lgroup containing all the resources. A 2 lgroup 284 * system with a root lgroup directly containing CPUs or memory might 285 * need lgroup optimizations with its child lgroup, but there 286 * isn't such a machine for now.... 287 */ 288 if (nlgrps > 2) 289 return (1); 290 291 return (0); 292 } 293 294 /* 295 * Setup root lgroup 296 */ 297 static void 298 lgrp_root_init(void) 299 { 300 lgrp_handle_t hand; 301 int i; 302 lgrp_id_t id; 303 304 /* 305 * Create the "root" lgroup 306 */ 307 ASSERT(nlgrps == 0); 308 id = nlgrps++; 309 310 lgrp_root = &lroot; 311 312 lgrp_root->lgrp_cpu = NULL; 313 lgrp_root->lgrp_mnodes = 0; 314 lgrp_root->lgrp_nmnodes = 0; 315 hand = lgrp_plat_root_hand(); 316 lgrp_root->lgrp_plathand = hand; 317 318 lgrp_root->lgrp_id = id; 319 lgrp_root->lgrp_cpucnt = 0; 320 lgrp_root->lgrp_childcnt = 0; 321 klgrpset_clear(lgrp_root->lgrp_children); 322 klgrpset_clear(lgrp_root->lgrp_leaves); 323 lgrp_root->lgrp_parent = NULL; 324 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 325 326 for (i = 0; i < LGRP_RSRC_COUNT; i++) 327 klgrpset_clear(lgrp_root->lgrp_set[i]); 328 329 lgrp_root->lgrp_kstat = NULL; 330 331 lgrp_table[id] = lgrp_root; 332 333 /* 334 * Setup initial lpl list for CPU0 and initial t0 home. 335 * The only lpl space we have so far is lpl_bootstrap. It is used for 336 * all topology operations until cp_default is initialized at which 337 * point t0.t_lpl will be updated. 
338 */ 339 lpl_bootstrap = lpl_bootstrap_list; 340 t0.t_lpl = lpl_bootstrap; 341 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 342 lpl_bootstrap_list[1].lpl_lgrpid = 1; 343 344 /* 345 * Set up the bootstrap rset 346 * Since the bootstrap toplogy has just the root, and a leaf, 347 * the rset contains just the leaf, and both lpls can use the same rset 348 */ 349 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1]; 350 lpl_bootstrap_list[0].lpl_rset_sz = 1; 351 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; 352 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; 353 354 lpl_bootstrap_list[1].lpl_rset_sz = 1; 355 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset; 356 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset; 357 358 cp_default.cp_lgrploads = lpl_bootstrap; 359 } 360 361 /* 362 * Initialize the lgroup framework and allow the platform to do the same 363 * 364 * This happens in stages during boot and is all funnelled through this routine 365 * (see definition of lgrp_init_stages_t to see what happens at each stage and 366 * when) 367 */ 368 void 369 lgrp_init(lgrp_init_stages_t stage) 370 { 371 /* 372 * Initialize the platform 373 */ 374 lgrp_plat_init(stage); 375 376 switch (stage) { 377 case LGRP_INIT_STAGE1: 378 /* 379 * Set max number of lgroups supported on this platform which 380 * must be less than the max number of lgroups supported by the 381 * common lgroup framework (eg. NLGRPS_MAX is max elements in 382 * lgrp_table[], etc.) 383 */ 384 nlgrpsmax = lgrp_plat_max_lgrps(); 385 ASSERT(nlgrpsmax <= NLGRPS_MAX); 386 break; 387 388 case LGRP_INIT_STAGE2: 389 lgrp_setup(); 390 break; 391 392 case LGRP_INIT_STAGE4: 393 lgrp_main_init(); 394 break; 395 396 case LGRP_INIT_STAGE5: 397 lgrp_main_mp_init(); 398 break; 399 400 default: 401 break; 402 } 403 } 404 405 /* 406 * Create the root and cpu0's lgroup, and set t0's home. 
407 */ 408 static void 409 lgrp_setup(void) 410 { 411 /* 412 * Setup the root lgroup 413 */ 414 lgrp_root_init(); 415 416 /* 417 * Add cpu0 to an lgroup 418 */ 419 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 420 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 421 } 422 423 /* 424 * true when lgrp initialization has been completed. 425 */ 426 int lgrp_initialized = 0; 427 428 /* 429 * True when lgrp topology is constructed. 430 */ 431 int lgrp_topo_initialized = 0; 432 433 /* 434 * Init routine called after startup(), /etc/system has been processed, 435 * and cpu0 has been added to an lgroup. 436 */ 437 static void 438 lgrp_main_init(void) 439 { 440 cpu_t *cp = CPU; 441 lgrp_id_t lgrpid; 442 int i; 443 extern void pg_cpu0_reinit(); 444 445 /* 446 * Enforce a valid lgrp_mem_default_policy 447 */ 448 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 449 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) || 450 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG)) 451 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 452 453 /* 454 * See if mpo should be disabled. 455 * This may happen in the case of null proc LPA on Starcat. 456 * The platform won't be able to detect null proc LPA until after 457 * cpu0 and memory have already been added to lgroups. 458 * When and if it is detected, the Starcat platform will return 459 * a different platform handle for cpu0 which is what we check for 460 * here. If mpo should be disabled move cpu0 to it's rightful place 461 * (the root), and destroy the remaining lgroups. This effectively 462 * provides an UMA lgroup topology. 
463 */ 464 lgrpid = cp->cpu_lpl->lpl_lgrpid; 465 if (lgrp_table[lgrpid]->lgrp_plathand != 466 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 467 lgrp_part_del_cpu(cp); 468 lgrp_cpu_fini(cp, lgrpid); 469 470 lgrp_cpu_init(cp); 471 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 472 473 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 474 475 /* 476 * Notify the PG subsystem that the CPU's lgrp 477 * association has changed 478 */ 479 pg_cpu0_reinit(); 480 481 /* 482 * Destroy all lgroups except for root 483 */ 484 for (i = 0; i <= lgrp_alloc_max; i++) { 485 if (LGRP_EXISTS(lgrp_table[i]) && 486 lgrp_table[i] != lgrp_root) 487 lgrp_destroy(lgrp_table[i]); 488 } 489 490 /* 491 * Fix up root to point at itself for leaves and resources 492 * and not have any children 493 */ 494 lgrp_root->lgrp_childcnt = 0; 495 klgrpset_clear(lgrp_root->lgrp_children); 496 klgrpset_clear(lgrp_root->lgrp_leaves); 497 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 498 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 499 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 500 } 501 502 /* 503 * Initialize kstats framework. 504 */ 505 lgrp_kstat_init(); 506 /* 507 * cpu0 is finally where it should be, so create it's lgroup's kstats 508 */ 509 mutex_enter(&cpu_lock); 510 lgrp_kstat_create(cp); 511 mutex_exit(&cpu_lock); 512 513 lgrp_initialized = 1; 514 } 515 516 /* 517 * Finish lgrp initialization after all CPUS are brought on-line. 518 * This routine is called after start_other_cpus(). 
519 */ 520 static void 521 lgrp_main_mp_init(void) 522 { 523 klgrpset_t changed; 524 525 ht_init(); 526 527 /* 528 * Update lgroup topology (if necessary) 529 */ 530 klgrpset_clear(changed); 531 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 532 lgrp_topo_initialized = 1; 533 } 534 535 /* 536 * Change latency of lgroup with specified lgroup platform handle (if one is 537 * given) or change all lgroups with old latency to new latency 538 */ 539 void 540 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 541 u_longlong_t newtime) 542 { 543 lgrp_t *lgrp; 544 int i; 545 546 for (i = 0; i <= lgrp_alloc_max; i++) { 547 lgrp = lgrp_table[i]; 548 549 if (!LGRP_EXISTS(lgrp)) 550 continue; 551 552 if ((hand == LGRP_NULL_HANDLE && 553 lgrp->lgrp_latency == oldtime) || 554 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 555 lgrp->lgrp_latency = (int)newtime; 556 } 557 } 558 559 /* 560 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 561 */ 562 void 563 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 564 { 565 klgrpset_t changed; 566 cpu_t *cp; 567 lgrp_id_t id; 568 int rc; 569 570 switch (event) { 571 /* 572 * The following (re)configuration events are common code 573 * initiated. lgrp_plat_config() is called here to inform the 574 * platform of the reconfiguration event. 575 */ 576 case LGRP_CONFIG_CPU_ADD: 577 cp = (cpu_t *)resource; 578 579 /* 580 * Initialize the new CPU's lgrp related next/prev 581 * links, and give it a bootstrap lpl so that it can 582 * survive should it need to enter the dispatcher. 
583 */ 584 cp->cpu_next_lpl = cp; 585 cp->cpu_prev_lpl = cp; 586 cp->cpu_next_lgrp = cp; 587 cp->cpu_prev_lgrp = cp; 588 cp->cpu_lpl = lpl_bootstrap; 589 590 lgrp_plat_config(event, resource); 591 atomic_inc_32(&lgrp_gen); 592 593 break; 594 case LGRP_CONFIG_CPU_DEL: 595 lgrp_plat_config(event, resource); 596 atomic_inc_32(&lgrp_gen); 597 598 break; 599 case LGRP_CONFIG_CPU_ONLINE: 600 cp = (cpu_t *)resource; 601 lgrp_cpu_init(cp); 602 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 603 rc = lpl_topo_verify(cp->cpu_part); 604 if (rc != LPL_TOPO_CORRECT) { 605 panic("lpl_topo_verify failed: %d", rc); 606 } 607 lgrp_plat_config(event, resource); 608 atomic_inc_32(&lgrp_gen); 609 610 break; 611 case LGRP_CONFIG_CPU_OFFLINE: 612 cp = (cpu_t *)resource; 613 id = cp->cpu_lpl->lpl_lgrpid; 614 lgrp_part_del_cpu(cp); 615 lgrp_cpu_fini(cp, id); 616 rc = lpl_topo_verify(cp->cpu_part); 617 if (rc != LPL_TOPO_CORRECT) { 618 panic("lpl_topo_verify failed: %d", rc); 619 } 620 lgrp_plat_config(event, resource); 621 atomic_inc_32(&lgrp_gen); 622 623 break; 624 case LGRP_CONFIG_CPUPART_ADD: 625 cp = (cpu_t *)resource; 626 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 627 rc = lpl_topo_verify(cp->cpu_part); 628 if (rc != LPL_TOPO_CORRECT) { 629 panic("lpl_topo_verify failed: %d", rc); 630 } 631 lgrp_plat_config(event, resource); 632 633 break; 634 case LGRP_CONFIG_CPUPART_DEL: 635 cp = (cpu_t *)resource; 636 lgrp_part_del_cpu((cpu_t *)resource); 637 rc = lpl_topo_verify(cp->cpu_part); 638 if (rc != LPL_TOPO_CORRECT) { 639 panic("lpl_topo_verify failed: %d", rc); 640 } 641 lgrp_plat_config(event, resource); 642 643 break; 644 /* 645 * The following events are initiated by the memnode 646 * subsystem. 
647 */ 648 case LGRP_CONFIG_MEM_ADD: 649 lgrp_mem_init((int)resource, where, B_FALSE); 650 atomic_inc_32(&lgrp_gen); 651 652 break; 653 case LGRP_CONFIG_MEM_DEL: 654 lgrp_mem_fini((int)resource, where, B_FALSE); 655 atomic_inc_32(&lgrp_gen); 656 657 break; 658 case LGRP_CONFIG_MEM_RENAME: { 659 lgrp_config_mem_rename_t *ren_arg = 660 (lgrp_config_mem_rename_t *)where; 661 662 lgrp_mem_rename((int)resource, 663 ren_arg->lmem_rename_from, 664 ren_arg->lmem_rename_to); 665 atomic_inc_32(&lgrp_gen); 666 667 break; 668 } 669 case LGRP_CONFIG_GEN_UPDATE: 670 atomic_inc_32(&lgrp_gen); 671 672 break; 673 case LGRP_CONFIG_FLATTEN: 674 if (where == 0) 675 lgrp_topo_levels = (int)resource; 676 else 677 (void) lgrp_topo_flatten(resource, 678 lgrp_table, lgrp_alloc_max, &changed); 679 680 break; 681 /* 682 * Update any lgroups with old latency to new latency 683 */ 684 case LGRP_CONFIG_LAT_CHANGE_ALL: 685 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 686 (u_longlong_t)where); 687 688 break; 689 /* 690 * Update lgroup with specified lgroup platform handle to have 691 * new latency 692 */ 693 case LGRP_CONFIG_LAT_CHANGE: 694 lgrp_latency_change((lgrp_handle_t)resource, 0, 695 (u_longlong_t)where); 696 697 break; 698 case LGRP_CONFIG_NOP: 699 700 break; 701 default: 702 break; 703 } 704 705 } 706 707 /* 708 * Called to add lgrp info into cpu structure from cpu_add_unit; 709 * do not assume cpu is in cpu[] yet! 710 * 711 * CPUs are brought online with all other CPUs paused so we can't 712 * allocate memory or we could deadlock the system, so we rely on 713 * the platform to statically allocate as much space as we need 714 * for the lgrp structs and stats. 715 */ 716 static void 717 lgrp_cpu_init(struct cpu *cp) 718 { 719 klgrpset_t changed; 720 int count; 721 lgrp_handle_t hand; 722 int first_cpu; 723 lgrp_t *my_lgrp; 724 lgrp_id_t lgrpid; 725 struct cpu *cptr; 726 727 /* 728 * This is the first time through if the resource set 729 * for the root lgroup is empty. 
After cpu0 has been 730 * initially added to an lgroup, the root's CPU resource 731 * set can never be empty, since the system's last CPU 732 * cannot be offlined. 733 */ 734 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 735 /* 736 * First time through. 737 */ 738 first_cpu = 1; 739 } else { 740 /* 741 * If cpu0 needs to move lgroups, we may come 742 * through here again, at which time cpu_lock won't 743 * be held, and lgrp_initialized will be false. 744 */ 745 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 746 ASSERT(cp->cpu_part != NULL); 747 first_cpu = 0; 748 } 749 750 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 751 my_lgrp = lgrp_hand_to_lgrp(hand); 752 753 if (my_lgrp == NULL) { 754 /* 755 * Create new lgrp and add it to lgroup topology 756 */ 757 my_lgrp = lgrp_create(); 758 my_lgrp->lgrp_plathand = hand; 759 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 760 lgrpid = my_lgrp->lgrp_id; 761 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 762 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 763 764 count = 0; 765 klgrpset_clear(changed); 766 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 767 &changed); 768 /* 769 * May have added new intermediate lgroups, so need to add 770 * resources other than CPUs which are added below 771 */ 772 (void) lgrp_mnode_update(changed, NULL); 773 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 774 > 0) { 775 /* 776 * Leaf lgroup was created, but latency wasn't available 777 * then. So, set latency for it and fill in rest of lgroup 778 * topology now that we know how far it is from other leaf 779 * lgroups. 
780 */ 781 lgrpid = my_lgrp->lgrp_id; 782 klgrpset_clear(changed); 783 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 784 lgrpid)) 785 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 786 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 787 &changed); 788 789 /* 790 * May have added new intermediate lgroups, so need to add 791 * resources other than CPUs which are added below 792 */ 793 (void) lgrp_mnode_update(changed, NULL); 794 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 795 my_lgrp->lgrp_id)) { 796 int i; 797 798 /* 799 * Update existing lgroup and lgroups containing it with CPU 800 * resource 801 */ 802 lgrpid = my_lgrp->lgrp_id; 803 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 804 for (i = 0; i <= lgrp_alloc_max; i++) { 805 lgrp_t *lgrp; 806 807 lgrp = lgrp_table[i]; 808 if (!LGRP_EXISTS(lgrp) || 809 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 810 continue; 811 812 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 813 } 814 } 815 816 lgrpid = my_lgrp->lgrp_id; 817 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 818 819 /* 820 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 821 * end up in lpl for lgroup 0 whether it is supposed to be in there or 822 * not since none of lgroup IDs in the lpl's have been set yet. 
823 */ 824 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 825 cp->cpu_lpl->lpl_lgrpid = lgrpid; 826 827 /* 828 * link the CPU into the lgrp's CPU list 829 */ 830 if (my_lgrp->lgrp_cpucnt == 0) { 831 my_lgrp->lgrp_cpu = cp; 832 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 833 } else { 834 cptr = my_lgrp->lgrp_cpu; 835 cp->cpu_next_lgrp = cptr; 836 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 837 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 838 cptr->cpu_prev_lgrp = cp; 839 } 840 my_lgrp->lgrp_cpucnt++; 841 } 842 843 lgrp_t * 844 lgrp_create(void) 845 { 846 lgrp_t *my_lgrp; 847 lgrp_id_t lgrpid; 848 int i; 849 850 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 851 852 /* 853 * Find an open slot in the lgroup table and recycle unused lgroup 854 * left there if any 855 */ 856 my_lgrp = NULL; 857 if (lgrp_alloc_hint == -1) 858 /* 859 * Allocate from end when hint not set yet because no lgroups 860 * have been deleted yet 861 */ 862 lgrpid = nlgrps++; 863 else { 864 /* 865 * Start looking for next open slot from hint and leave hint 866 * at slot allocated 867 */ 868 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 869 my_lgrp = lgrp_table[i]; 870 if (!LGRP_EXISTS(my_lgrp)) { 871 lgrpid = i; 872 nlgrps++; 873 break; 874 } 875 } 876 lgrp_alloc_hint = lgrpid; 877 } 878 879 /* 880 * Keep track of max lgroup ID allocated so far to cut down on searches 881 */ 882 if (lgrpid > lgrp_alloc_max) 883 lgrp_alloc_max = lgrpid; 884 885 /* 886 * Need to allocate new lgroup if next open slot didn't have one 887 * for recycling 888 */ 889 if (my_lgrp == NULL) 890 my_lgrp = lgrp_plat_alloc(lgrpid); 891 892 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 893 panic("Too many lgrps for platform (%d)", nlgrps); 894 895 my_lgrp->lgrp_id = lgrpid; 896 my_lgrp->lgrp_latency = 0; 897 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 898 my_lgrp->lgrp_parent = NULL; 899 my_lgrp->lgrp_childcnt = 0; 900 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 901 my_lgrp->lgrp_nmnodes = 0; 902 
klgrpset_clear(my_lgrp->lgrp_children); 903 klgrpset_clear(my_lgrp->lgrp_leaves); 904 for (i = 0; i < LGRP_RSRC_COUNT; i++) 905 klgrpset_clear(my_lgrp->lgrp_set[i]); 906 907 my_lgrp->lgrp_cpu = NULL; 908 my_lgrp->lgrp_cpucnt = 0; 909 910 if (my_lgrp->lgrp_kstat != NULL) 911 lgrp_kstat_reset(lgrpid); 912 913 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 914 915 return (my_lgrp); 916 } 917 918 void 919 lgrp_destroy(lgrp_t *lgrp) 920 { 921 int i; 922 923 /* 924 * Unless this lgroup is being destroyed on behalf of 925 * the boot CPU, cpu_lock must be held 926 */ 927 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 928 929 if (nlgrps == 1) 930 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 931 932 if (!LGRP_EXISTS(lgrp)) 933 return; 934 935 /* 936 * Set hint to lgroup being deleted and try to keep lower numbered 937 * hints to facilitate finding empty slots 938 */ 939 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 940 lgrp_alloc_hint = lgrp->lgrp_id; 941 942 /* 943 * Mark this lgroup to be recycled by setting its lgroup ID to 944 * LGRP_NONE and clear relevant fields 945 */ 946 lgrp->lgrp_id = LGRP_NONE; 947 lgrp->lgrp_latency = 0; 948 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 949 lgrp->lgrp_parent = NULL; 950 lgrp->lgrp_childcnt = 0; 951 952 klgrpset_clear(lgrp->lgrp_children); 953 klgrpset_clear(lgrp->lgrp_leaves); 954 for (i = 0; i < LGRP_RSRC_COUNT; i++) 955 klgrpset_clear(lgrp->lgrp_set[i]); 956 957 lgrp->lgrp_mnodes = (mnodeset_t)0; 958 lgrp->lgrp_nmnodes = 0; 959 960 lgrp->lgrp_cpu = NULL; 961 lgrp->lgrp_cpucnt = 0; 962 963 nlgrps--; 964 } 965 966 /* 967 * Initialize kstat data. Called from lgrp intialization code. 
968 */ 969 static void 970 lgrp_kstat_init(void) 971 { 972 lgrp_stat_t stat; 973 974 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 975 976 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 977 kstat_named_init(&lgrp_kstat_data[stat], 978 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 979 } 980 981 /* 982 * initialize an lgrp's kstats if needed 983 * called with cpu_lock held but not with cpus paused. 984 * we don't tear these down now because we don't know about 985 * memory leaving the lgrp yet... 986 */ 987 988 void 989 lgrp_kstat_create(cpu_t *cp) 990 { 991 kstat_t *lgrp_kstat; 992 lgrp_id_t lgrpid; 993 lgrp_t *my_lgrp; 994 995 ASSERT(MUTEX_HELD(&cpu_lock)); 996 997 lgrpid = cp->cpu_lpl->lpl_lgrpid; 998 my_lgrp = lgrp_table[lgrpid]; 999 1000 if (my_lgrp->lgrp_kstat != NULL) 1001 return; /* already initialized */ 1002 1003 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 1004 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 1005 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 1006 1007 if (lgrp_kstat != NULL) { 1008 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 1009 lgrp_kstat->ks_private = my_lgrp; 1010 lgrp_kstat->ks_data = &lgrp_kstat_data; 1011 lgrp_kstat->ks_update = lgrp_kstat_extract; 1012 my_lgrp->lgrp_kstat = lgrp_kstat; 1013 kstat_install(lgrp_kstat); 1014 } 1015 } 1016 1017 /* 1018 * this will do something when we manage to remove now unused lgrps 1019 */ 1020 1021 /* ARGSUSED */ 1022 void 1023 lgrp_kstat_destroy(cpu_t *cp) 1024 { 1025 ASSERT(MUTEX_HELD(&cpu_lock)); 1026 } 1027 1028 /* 1029 * Called when a CPU is off-lined. 1030 */ 1031 static void 1032 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 1033 { 1034 lgrp_t *my_lgrp; 1035 struct cpu *prev; 1036 struct cpu *next; 1037 1038 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1039 1040 prev = cp->cpu_prev_lgrp; 1041 next = cp->cpu_next_lgrp; 1042 1043 prev->cpu_next_lgrp = next; 1044 next->cpu_prev_lgrp = prev; 1045 1046 /* 1047 * just because I'm paranoid doesn't mean... 
 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	/*
	 * Lgroup still has CPUs: if the departing CPU was the head of the
	 * lgroup's circular CPU list, advance the head to the next CPU.
	 */
	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}

/*
 * Update memory nodes in target lgroups and return the number of updates
 * made.  The set of lgroups touched is returned in "*changed" when it is
 * non-NULL.
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	/* Nothing to do for an empty target set */
	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			/*
			 * NOTE(review): count/changed are bumped once per
			 * memory resource examined, not once per target
			 * lgroup, so "count" may exceed the number of
			 * distinct lgroups in "*changed" (klgrpset_add is
			 * idempotent).  Verify callers only rely on count
			 * being zero/non-zero.
			 */
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
	 * recognize this case and continue as usual, but skip the update to
	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
	 * in topology, temporarily introduced by lgrp_mem_fini().
	 */
	if (!(is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	}
	/*
	 * NOTE(review): if none of the three branches above is taken (lgroup
	 * exists, has a non-zero latency, and already has its own ID in its
	 * LGRP_RSRC_MEM set), "lgrpid" reaches the klgrpset_del() below
	 * unassigned.  It appears the early return above is meant to make
	 * that unreachable -- confirm.
	 */

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}

/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}

/*
 * Return lgroup with given platform handle
 */
lgrp_t *
lgrp_hand_to_lgrp(lgrp_handle_t hand)
{
	int	i;
	lgrp_t	*lgrp;

	if (hand == LGRP_NULL_HANDLE)
		return (NULL);

	/* Linear scan of the lgroup table for a matching platform handle */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
			return (lgrp);
	}
	return (NULL);
}

/*
 * Return the home lgroup of the current thread.
 * We must do this with kernel preemption disabled, since we don't want our
 * thread to be re-homed while we're poking around with its lpl, and the lpl
 * should never be NULL.
 *
 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
 * is enabled because of DR. Callers can use disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
lgrp_t *
lgrp_home_lgrp(void)
{
	lgrp_t	*lgrp;
	lpl_t	*lpl;

	kpreempt_disable();

	lpl = curthread->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
	lgrp = lgrp_table[lpl->lpl_lgrpid];

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return ID of home lgroup for given thread
 * (See comments for lgrp_home_lgrp() for special care and handling
 * instructions)
 */
lgrp_id_t
lgrp_home_id(kthread_t *t)
{
	lgrp_id_t	lgrp;
	lpl_t		*lpl;

	ASSERT(t != NULL);
	/*
	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
	 * cannot since the HAT layer can call into this routine to
	 * determine the locality for its data structures in the context
	 * of a page fault.
	 */

	kpreempt_disable();

	lpl = t->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	lgrp = lpl->lpl_lgrpid;

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return lgroup containing the physical memory for the given page frame number
 */
lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;

	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return lgroup containing the physical memory for the given physical
 * address.
 *
 * NOTE(review): this duplicates lgrp_pfn_to_lgrp() after the btop()
 * conversion; it could simply delegate to it.
 */
lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;
	pfn_t		pfn;

	pfn = btop(physaddr);
	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take precautions necessary to prevent
 * "cpu", and its lpl from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_lpl->lpl_lgrp);
}

/*
 * Return the sum of the partition loads in an lgrp divided by
 * the number of CPUs in the lgrp. This is our best approximation
 * of an 'lgroup load average' for a useful per-lgroup kstat.
 */
static uint64_t
lgrp_sum_loadavgs(lgrp_t *lgrp)
{
	cpu_t	*cpu;
	int	ncpu;
	uint64_t loads = 0;

	mutex_enter(&cpu_lock);

	cpu = lgrp->lgrp_cpu;
	ncpu = lgrp->lgrp_cpucnt;

	/* No CPUs in this lgroup means a load average of zero */
	if (cpu == NULL || ncpu == 0) {
		mutex_exit(&cpu_lock);
		return (0ull);
	}

	/* Walk the lgroup's circular list of CPUs */
	do {
		loads += cpu->cpu_lpl->lpl_loadavg;
		cpu = cpu->cpu_next_lgrp;
	} while (cpu != lgrp->lgrp_cpu);

	mutex_exit(&cpu_lock);

	return (loads / ncpu);
}

/*
 * Atomically add "val" to counter statistic "stat" of lgroup "lgrpid".
 */
void
lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
{
	struct lgrp_stats *pstats;

	/*
	 * Verify that the caller isn't trying to add to
	 * a statistic for an lgroup that has gone away
	 */
	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	pstats = &lgrp_stats[lgrpid];
	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
}

/*
 * Read counter statistic "stat" of lgroup "lgrpid".
 * Returns 0 for an out-of-range lgroup ID.
 */
int64_t
lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
{
	uint64_t val;
	struct lgrp_stats *pstats;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return ((int64_t)0);

	pstats = &lgrp_stats[lgrpid];
	LGRP_STAT_READ(pstats, stat, val);
	return (val);
}

/*
 * Reset all kstats for lgrp specified by its lgrpid.
 */
static void
lgrp_kstat_reset(lgrp_id_t lgrpid)
{
	lgrp_stat_t stat;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
	}
}

/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t		stat;
	struct kstat_named	*ksd;
	lgrp_t			*lgrp;
	lgrp_id_t		lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		/*
		 * After the loop, "stat" == LGRP_NUM_COUNTER_STATS, so the
		 * snapshot stats below index just past the counter stats.
		 *
		 * NOTE(review): unlike the read path below, this branch does
		 * not zero LGRP_LOADAVG_SCALE -- confirm whether that is
		 * intentional.
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
		    lgrp_loadavg_max_effect;
	} else {
		/* A write resets the running counters */
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}

/*
 * Store the home lgroup ID of CPU "id" into "*lp".
 * Returns EINVAL if the CPU does not exist or is offline/powered off.
 */
int
lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_lgrpid;

	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Store the load average of CPU "id"'s lpl into "*lp".
 * Returns EINVAL if the CPU does not exist.
 */
int
lgrp_query_load(processorid_t id, lgrp_load_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_loadavg;

	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL. (This list is required to be NULL
 * terminated, too). This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order. We hope this
 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		/* rset is kept sorted by lgrpid; stop at insertion point */
		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 */
	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
		    i + 1;
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;

	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}

/*
 * Update each of lpl_parent's children with a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 * This should be called after any potential change in lpl_parent's
 * rset.
 */
static void
lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
{
	klgrpset_t	children;
	int		i;

	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
	if (klgrpset_isempty(children))
		return;		/* nothing to do */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (klgrpset_ismember(children, i)) {
			/*
			 * (Re)set the parent. It may be incorrect if
			 * lpl_parent is new in the topology.
			 */
			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
		}
	}
}

/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	lpl_t		*leaf;

	if (lpl_target->lpl_nrset == 0)
		return;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/*
	 * return if leaf not found
	 * (when not found, i == lpl_nrset and this reads the NULL
	 * terminator slot of the rset array)
	 */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
		/*
		 * Update the lgrp id <=> rset mapping
		 */
		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
		}
	} while (i++ < lpl_target->lpl_nrset);
}

/*
 * Check to see if the resource set of the target lpl contains the
 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
 */

int
lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			return (1);
	}

	return (0);
}

/*
 * Called when we change cpu lpl membership. This increments or decrements the
 * per-cpu counter in every lpl in which our leaf appears.
 */
void
lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_leaf;
	lpl_t		*lpl_cur;
	int		i;

	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);

	cpupart = cp->cpu_part;
	lpl_leaf = cp->cpu_lpl;
	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
		 * for the cpu in question, or if the current lgrp and leaf
		 * don't share the same resources.
		 */

		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
			continue;


		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		/* only adjust lpls that are actually in use (have an rset) */
		if (lpl_cur->lpl_nrset > 0) {
			if (act == LPL_INCREMENT) {
				lpl_cur->lpl_ncpu++;
			} else if (act == LPL_DECREMENT) {
				lpl_cur->lpl_ncpu--;
			}
		}
	}
}

/*
 * Initialize lpl with given resources and specified lgrp
 */
void
lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
{
	lpl->lpl_lgrpid = lgrp->lgrp_id;
	lpl->lpl_loadavg = 0;
	/* a leaf lpl starts with one CPU; others inherit the leaf's count */
	if (lpl == lpl_leaf)
		lpl->lpl_ncpu = 1;
	else
		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
	lpl->lpl_nrset = 1;
	lpl->lpl_rset[0] = lpl_leaf;
	lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
	lpl->lpl_lgrp = lgrp;
	lpl->lpl_parent = NULL;		/* set by lpl_leaf_insert() */
	lpl->lpl_cpus = NULL;		/* set by lgrp_part_add_cpu() */
}

/*
 * Clear an unused lpl
 */
void
lpl_clear(lpl_t *lpl)
{
	/*
	 * Clear out all fields in the lpl except:
	 *    lpl_lgrpid - to facilitate debugging
	 *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
	 *
	 * Note that the lpl's rset and id2rset
	 * mapping contents are cleared as well (only the array pointers and
	 * their size are preserved).
	 */
	lpl->lpl_loadavg = 0;
	lpl->lpl_ncpu = 0;
	lpl->lpl_lgrp = NULL;
	lpl->lpl_parent = NULL;
	lpl->lpl_cpus = NULL;
	lpl->lpl_nrset = 0;
	lpl->lpl_homed_time = 0;
	/* Zero the rset contents but keep the arrays themselves allocated */
	bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
	bzero(lpl->lpl_id2rset,
	    sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
}

/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system. The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist. This function returns
 * 0 if the topology is correct, and a non-zero error code, for non-debug
 * kernels if incorrect. Asserts are spread throughout the code to aid in
 * debugging on a DEBUG kernel.
 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
	lgrp_t		*lgrp;
	lpl_t		*lpl;
	klgrpset_t	rset;
	klgrpset_t	cset;
	cpu_t		*cpu;
	cpu_t		*cp_start;
	int		i;
	int		j;
	int		sum;

	/* topology can't be incorrect if it doesn't exist */
	if (!lgrp_topo_initialized || !lgrp_initialized)
		return (LPL_TOPO_CORRECT);

	ASSERT(cpupart != NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		lpl = NULL;
		/* make sure lpls are allocated */
		ASSERT(cpupart->cp_lgrploads);
		if (!cpupart->cp_lgrploads)
			return (LPL_TOPO_PART_HAS_NO_LPL);

		lpl = &cpupart->cp_lgrploads[i];
		/* make sure our index is good */
		ASSERT(i < cpupart->cp_nlgrploads);

		/* if lgroup doesn't exist, make sure lpl is empty */
		if (!LGRP_EXISTS(lgrp)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			} else {
				continue;
			}
		}

		/* verify that lgroup and lpl are identically numbered */
		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}
			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl;	/* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}


		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent);
			ASSERT(lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/* only leaf lgroups keep a cpucnt, only check leaves */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list. This only exists in
			 * leaves, but they should always match.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/* check next cpu; stop once the circular
				 * list wraps back to the start */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t	intersect;
				lgrp_t		*lgrp_cand;
				lpl_t		*lpl_cand;

				lgrp_cand = lgrp_table[j];
				/*
				 * klgrpset_intersects() yields the
				 * intersection mask of the two sets here,
				 * not merely a boolean.
				 */
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand, lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}

		} else { /* non-leaf specific checks */

			/*
			 * Non-leaf lpls should have lpl_cpus == NULL
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf resources
			 * is equal to the total ncpu in the intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}

		/*
		 * Check the rset of the lpl in question. Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set. (Which would be resources somehow not
		 * accounted for).
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained within partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; j < lpl->lpl_nrset; j++) {
			if (lpl->lpl_rset[j] == NULL)
				break;
		}

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}

	}
	return (LPL_TOPO_CORRECT);
}

/*
 * Flatten lpl topology to given number of levels. This is presently only
 * implemented for a flatten to 2 levels, which will prune out the intermediates
 * and home the leaf lpls to the root lpl.
 */
int
lpl_topo_flatten(int levels)
{
	int		i;
	uint_t		sum;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_root;
	cpupart_t	*cp;

	/* only 2-level flattening is implemented */
	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	/* Walk every CPU partition's lpl array */
	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted. (And
				 * whose parent will soon be deleted). Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}

		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	return (levels);
}

/*
 * Insert a lpl into the resource hierarchy and create any additional lpls that
 * are necessary to represent the varying states of locality for the cpu
 * resources newly added to the partition.
 *
 * This routine is clever enough that it can correctly add resources from the
 * new leaf into both direct and indirect resource sets in the hierarchy.
 * (Ie, those for which the lpl is a leaf as opposed to simply a named equally
 * local resource). The one special case that needs additional processing is
 * when a new intermediate lpl is introduced. Since the main loop only
 * traverses looking to add the leaf resource where it does not yet exist,
 * additional work is necessary to add other leaf resources that may need to
 * exist in the newly created intermediate. This is performed by the second
 * inner loop, and is only done when the check for more than one overlapping
 * resource succeeds.
 */
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	lgrp_id_t	parent_id;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't insert if the lgrp isn't there, if the leaf isn't
		 * contained within the current lgrp, or if the current lgrp
		 * has no leaves in this partition
		 */
		if (!LGRP_EXISTS(lgrp_cur) ||
		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
		if (lgrp_cur->lgrp_parent != NULL) {
			/* if lgrp has a parent, assign it properly */
			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
			lpl_parent = &cpupart->cp_lgrploads[parent_id];
		} else {
			/* if not, make sure parent ptr gets set to null */
			lpl_parent = NULL;
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * Almost all leaf state was initialized elsewhere. The
			 * only thing left to do is to set the parent.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		/* (re)build this ancestor around the new leaf */
		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * If so, figure out what lpls have resources that
			 * intersect this one, and add them.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_ismember(rset_intersect,
				    lgrp_cand->lgrp_id))
					continue;
				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
				lpl_rset_add(lpl_cur, lpl_cand);
			}
		}
		/*
		 * This lpl's rset has changed. Update the hint in its
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}

/*
 * remove a lpl from the hierarchy of resources, clearing its state when
 * finished. If the lpls at the intermediate levels of the hierarchy have no
 * remaining resources, or no longer name a leaf resource in the
 * cpu-partition, delete them as well.
 */
void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	klgrpset_t	leaf_intersect;	/* intersection of leaves */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't attempt to remove from lgrps that aren't there, that
		 * don't contain our leaf, or from the leaf itself. (We do
		 * that later)
		 */
		if (!LGRP_EXISTS(lgrp_cur))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    (lpl_cur == lpl_leaf)) {
			continue;
		}

		/*
		 * This is a slightly sleazy simplification in that we have
		 * already marked the cp_lgrpset as no longer containing the
		 * leaf we've deleted. Any lpls that pass the above checks
		 * based upon lgrp membership but not necessarily cpu-part
		 * membership also get cleared by the checks below. Currently
		 * this is harmless, as the lpls should be empty anyway.
		 *
		 * In particular, we want to preserve lpls that have additional
		 * leaf resources, even though we don't yet have a processor
		 * architecture that represents resources this way.
		 */
		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset);

		lpl_rset_del(lpl_cur, lpl_leaf);
		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
			/* stranded intermediate: delete it entirely */
			lpl_clear(lpl_cur);
		} else {
			/*
			 * Update this lpl's children
			 */
			lpl_child_update(lpl_cur, cpupart);
		}
	}
	/* finally, clear the leaf itself */
	lpl_clear(lpl_leaf);
}

/*
 * add a cpu to a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
 *
 * 1. A lpl structure that contains resources already in the hierarchy tree.
 * In this case, all of the associated lpl relationships have been defined,
 * and all that is necessary is that we link the new cpu into the per-lpl list
 * of cpus, and increment the ncpu count of all places where this cpu resource
 * will be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu
 * pointer pushing is accomplished by this routine.
 *
 * 2. The lpl to contain the resources in this cpu-partition for this lgrp
 * does not exist yet. In this case, it is necessary to build the leaf lpl,
 * and construct the hierarchy of state necessary to name its more distant
 * resources, if they should exist. The leaf structure is initialized by this
 * routine, as is the cpu-partition state for the lgrp membership. This
 * routine also calls lpl_leaf_insert() which inserts the named lpl into the
 * hierarchy and builds all of the "ancestral" state necessary to identify
 * resources at differing levels of locality.
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lpl_t		*lpl_leaf;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	cpupart = cp->cpu_part;
	lgrp_leaf = lgrp_table[lgrpid];

	/* don't add non-existent lgrp */
	ASSERT(LGRP_EXISTS(lgrp_leaf));
	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
	cp->cpu_lpl = lpl_leaf;

	/* only leaf lpls contain cpus */

	if (lpl_leaf->lpl_ncpu++ == 0) {
		/* first cpu here: build the leaf and splice it in (case 2) */
		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
		lpl_leaf_insert(lpl_leaf, cpupart);
	} else {
		/*
		 * the lpl should already exist in the parent, so just update
		 * the count of available CPUs
		 */
		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
	}

	/*
	 * link cpu into list of cpus in lpl; the list is circular and
	 * doubly-linked, threaded through cpu_next_lpl/cpu_prev_lpl
	 */

	if (lpl_leaf->lpl_cpus) {
		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
	} else {
		/*
		 * We increment ncpu immediately after we create a new leaf
		 * lpl, so assert that ncpu == 1 for the case where we don't
		 * have any cpu pointers yet.
		 */
		ASSERT(lpl_leaf->lpl_ncpu == 1);
		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
	}

}


/*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf. (Another cpu still exists at this level of locality). In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the per-cpu lpl list.
 *
 * 2. Removal of the resource results in the lpl containing no resources.
 * (It's empty) In this case, all of what has occurred for the first step must
 * take place; however, additionally we must remove the lpl structure itself,
 * prune out any stranded lpls that do not directly name a leaf resource, and
 * mark the cpu partition in question as no longer containing resources from
 * the lgrp of the lpl that has been deleted. Cpu-partition changes are
 * handled by this method, but the lpl_leaf_remove function deals with the
 * details of pruning out the empty lpl and any of its orphaned direct
 * ancestors.
 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
	lpl_t		*lpl;
	lpl_t		*leaf_lpl;
	lgrp_t		*lgrp_leaf;

	/* called sometimes w/ cpus paused - grab no locks */

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	lpl = leaf_lpl = cp->cpu_lpl;
	lgrp_leaf = leaf_lpl->lpl_lgrp;

	/* don't delete a leaf that isn't there */
	ASSERT(LGRP_EXISTS(lgrp_leaf));

	/* no double-deletes */
	ASSERT(lpl->lpl_ncpu);
	if (--lpl->lpl_ncpu == 0) {
		/*
		 * This was the last cpu in this lgroup for this partition,
		 * clear its bit in the partition's lgroup bitmask
		 */
		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

		/* eliminate remaining lpl link pointers in cpu, lpl */
		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

		/* prune the now-empty leaf (and stranded ancestors) */
		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
	} else {

		/* unlink cpu from lists of cpus in lpl */
		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
		if (lpl->lpl_cpus == cp) {
			lpl->lpl_cpus = cp->cpu_next_lpl;
		}

		/*
		 * Update the cpu count in the lpls associated with parent
		 * lgroups.
		 */
		lpl_cpu_adjcnt(LPL_DECREMENT, cp);

	}
	/* clear cpu's lpl ptr when we're all done */
	cp->cpu_lpl = NULL;
}

/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request). Since the clock thread can not be preempted (since it
 * runs at highest priority), we know that cpu partitions can not change
 * (since doing so would require either the repartition requester or the
 * cpu_pause thread to run on this cpu), so we can update the cpu's load
 * without grabbing cpu_lock.
 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t	ncpu;
	int64_t	old, new, f;

	/*
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	/* walk from this lpl up through its ancestors to the root */
	for (;;) {

		/* decay factor, scaled by the lpl's cpu count */
		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu. We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			do {
				old = new = lpl->lpl_loadavg;
				/*
				 * Split the old load into integer (q) and
				 * fractional (r) 16-bit parts, pre-scaled by
				 * 2^7, so the multiply below can't overflow.
				 */
				q = (old >> 16) << 7;
				r = (old & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}

/*
 * Initialize lpl topology in the target based on topology currently present
 * in lpl_bootstrap.
 *
 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
 * initialize cp_default list of lpls. Up to this point all topology operations
 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
 * and all subsequent lpl operations should use it instead of lpl_bootstrap.
 * The `target' points to the list of lpls in cp_default and `size' is the
 * size of this list.
 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
 *
 * 1) Copies all fields from lpl_bootstrap to the target.
 *
 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 *
 * 3) Updates lpl_parent pointers to point to the lpls in the target list
 * instead of lpl_bootstrap.
 *
 * 4) Updates pointers in the resource list of the target to point to the lpls
 * in the target list instead of lpl_bootstrap.
 *
 * After lpl_topo_bootstrap() completes, target contains the same information
 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. There is no need for the information in lpl_bootstrap after
 * this and it is bzeroed.
 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
	lpl_t	*lpl = lpl_bootstrap;
	lpl_t	*target_lpl = target;
	lpl_t	**rset;
	int	*id2rset;
	int	sz;
	int	howmany;
	int	id;
	int	i;

	/*
	 * The only target that should be passed here is cp_default lpl list.
	 */
	ASSERT(target == cp_default.cp_lgrploads);
	ASSERT(size == cp_default.cp_nlgrploads);
	ASSERT(!lgrp_topo_initialized);
	ASSERT(ncpus == 1);

	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
		/*
		 * Copy all fields from lpl, except for the rset,
		 * lgrp id <=> rset mapping storage,
		 * and amount of storage
		 */
		rset = target_lpl->lpl_rset;
		id2rset = target_lpl->lpl_id2rset;
		sz = target_lpl->lpl_rset_sz;

		*target_lpl = *lpl;

		/* restore the target's own storage pointers */
		target_lpl->lpl_rset_sz = sz;
		target_lpl->lpl_rset = rset;
		target_lpl->lpl_id2rset = id2rset;

		/*
		 * Substitute CPU0 lpl pointer with one relative to target.
		 */
		if (lpl->lpl_cpus == CPU) {
			ASSERT(CPU->cpu_lpl == lpl);
			CPU->cpu_lpl = target_lpl;
		}

		/*
		 * Substitute parent information with parent relative to
		 * target (same offset from target as the bootstrap parent
		 * had from lpl_bootstrap).
		 */
		if (lpl->lpl_parent != NULL)
			target_lpl->lpl_parent = (lpl_t *)
			    (((uintptr_t)lpl->lpl_parent -
			    (uintptr_t)lpl_bootstrap) +
			    (uintptr_t)target);

		/*
		 * Walk over resource set substituting pointers relative to
		 * lpl_bootstrap's rset to pointers relative to target's
		 */
		ASSERT(lpl->lpl_nrset <= 1);

		for (id = 0; id < lpl->lpl_nrset; id++) {
			if (lpl->lpl_rset[id] != NULL) {
				target_lpl->lpl_rset[id] = (lpl_t *)
				    (((uintptr_t)lpl->lpl_rset[id] -
				    (uintptr_t)lpl_bootstrap) +
				    (uintptr_t)target);
			}
			target_lpl->lpl_id2rset[id] =
			    lpl->lpl_id2rset[id];
		}
	}

	/*
	 * Clean up the bootstrap lpls since we have switched over to the
	 * actual lpl array in the default cpu partition.
	 *
	 * We still need to keep one empty lpl around for newly starting
	 * slave CPUs to reference should they need to make it through the
	 * dispatcher prior to their lgrp/lpl initialization.
	 *
	 * The lpl related dispatcher code has been designed to work properly
	 * (and without extra checks) for this special case of a zero'ed
	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
	 * with lgrpid 0 and an empty resource set. Iteration over the rset
	 * array by the dispatcher is also NULL terminated for this reason.
	 *
	 * This provides the desired behaviour for an uninitialized CPU.
	 * It shouldn't see any other CPU to either dispatch to or steal
	 * from until it is properly initialized.
	 */
	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));

	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
}

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

/* threshold is scaled down by the number of cpus in the lgroup */
#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

/* difference is scaled down by the number of cpus in the lgroup */
#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2970 */ 2971 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2972 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2973 ((lgrp_loadavg_tolerance) / ncpu) 2974 2975 /* 2976 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2977 * average is above this threshold 2978 */ 2979 uint32_t lgrp_load_thresh = UINT32_MAX; 2980 2981 /* 2982 * lgrp_choose() will try to skip any lgroups with less memory 2983 * than this free when choosing a home lgroup 2984 */ 2985 pgcnt_t lgrp_mem_free_thresh = 0; 2986 2987 /* 2988 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2989 * one based on one of the following policies: 2990 * - Random selection 2991 * - Pseudo round robin placement 2992 * - Longest time since a thread was last placed 2993 */ 2994 #define LGRP_CHOOSE_RANDOM 1 2995 #define LGRP_CHOOSE_RR 2 2996 #define LGRP_CHOOSE_TIME 3 2997 2998 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2999 3000 /* 3001 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 3002 * be bound to a CPU or processor set. 3003 * 3004 * Arguments: 3005 * t The thread 3006 * cpupart The partition the thread belongs to. 3007 * 3008 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3009 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3010 * partitions changing out from under us and assumes that given thread is 3011 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3012 * disabled, so don't grab any locks because we should never block under 3013 * those conditions. 
 */
lpl_t *
lgrp_choose(kthread_t *t, cpupart_t *cpupart)
{
	lgrp_load_t	bestload, bestrload;
	int		lgrpid_offset, lgrp_count;
	lgrp_id_t	lgrpid, lgrpid_start;
	lpl_t		*lpl, *bestlpl, *bestrlpl;
	klgrpset_t	lgrpset;
	proc_t		*p;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));
	ASSERT(cpupart != NULL);

	p = t->t_procp;

	/* A process should always be in an active partition */
	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));

	bestlpl = bestrlpl = NULL;
	bestload = bestrload = LGRP_LOADAVG_MAX;
	lgrpset = cpupart->cp_lgrpset;

	/* pick a starting lgroup for the search, per the chosen policy */
	switch (lgrp_choose_policy) {
	case LGRP_CHOOSE_RR:
		/* round robin: next member lgroup after the last hint */
		lgrpid = cpupart->cp_lgrp_hint;
		do {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;
		} while (!klgrpset_ismember(lgrpset, lgrpid));

		break;
	default:
	case LGRP_CHOOSE_TIME:
	case LGRP_CHOOSE_RANDOM:
		/* pseudo-random member, seeded from the high-res clock */
		klgrpset_nlgrps(lgrpset, lgrp_count);
		lgrpid_offset =
		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
		for (lgrpid = 0; ; lgrpid++) {
			if (klgrpset_ismember(lgrpset, lgrpid)) {
				if (--lgrpid_offset == 0)
					break;
			}
		}
		break;
	}

	lgrpid_start = lgrpid;

	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
	    lgrp_id_t, cpupart->cp_lgrp_hint);

	/*
	 * Use lgroup affinities (if any) to choose best lgroup
	 *
	 * NOTE: Assumes that thread is protected from going away and its
	 *	 lgroup affinities won't change (ie. p_lock, or
	 *	 thread_lock() being held and/or CPUs paused)
	 */
	if (t->t_lgrp_affinity) {
		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
		if (lpl != NULL)
			return (lpl);
	}

	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));

	/* walk all lgroups once, starting from lgrpid_start and wrapping */
	do {
		pgcnt_t	npgs;

		/*
		 * Skip any lgroups outside of thread's pset
		 */
		if (!klgrpset_ismember(lgrpset, lgrpid)) {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;	/* wrap the search */
			continue;
		}

		/*
		 * Skip any non-leaf lgroups
		 */
		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
			continue;

		/*
		 * Skip any lgroups without enough free memory
		 * (when threshold set to nonzero positive value)
		 */
		if (lgrp_mem_free_thresh > 0) {
			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
			if (npgs < lgrp_mem_free_thresh) {
				if (++lgrpid > lgrp_alloc_max)
					lgrpid = 0;	/* wrap the search */
				continue;
			}
		}

		lpl = &cpupart->cp_lgrploads[lgrpid];
		if (klgrpset_isempty(p->p_lgrpset) ||
		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
			/*
			 * Either this is a new process or the process already
			 * has threads on this lgrp, so this is a preferred
			 * lgroup for the thread.
			 */
			if (bestlpl == NULL ||
			    lpl_pick(lpl, bestlpl)) {
				bestload = lpl->lpl_loadavg;
				bestlpl = lpl;
			}
		} else {
			/*
			 * The process doesn't have any threads on this lgrp,
			 * but we're willing to consider this lgrp if the load
			 * difference is big enough to justify splitting up
			 * the process' threads.
			 */
			if (bestrlpl == NULL ||
			    lpl_pick(lpl, bestrlpl)) {
				bestrload = lpl->lpl_loadavg;
				bestrlpl = lpl;
			}
		}
		if (++lgrpid > lgrp_alloc_max)
			lgrpid = 0;	/* wrap the search */
	} while (lgrpid != lgrpid_start);

	/*
	 * Return root lgroup if threshold isn't set to maximum value and
	 * lowest lgroup load average more than a certain threshold
	 */
	if (lgrp_load_thresh != UINT32_MAX &&
	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);

	/*
	 * If all the lgroups over which the thread's process is spread are
	 * heavily loaded, or otherwise undesirable, we'll consider placing
	 * the thread on one of the other leaf lgroups in the thread's
	 * partition.
	 */
	if ((bestlpl == NULL) ||
	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
	    (bestrload < bestload) &&	/* paranoid about wraparound */
	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
	    bestload))) {
		bestlpl = bestrlpl;
	}

	if (bestlpl == NULL) {
		/*
		 * No lgroup looked particularly good, but we still
		 * have to pick something. Go with the randomly selected
		 * legal lgroup we started with above.
		 */
		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
	}

	/* remember where we homed this thread, for RR and TIME policies */
	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
	bestlpl->lpl_homed_time = gethrtime_unscaled();

	ASSERT(bestlpl->lpl_ncpu > 0);
	return (bestlpl);
}

/*
 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3185 */ 3186 static int 3187 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3188 { 3189 lgrp_load_t l1, l2; 3190 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3191 3192 l1 = lpl1->lpl_loadavg; 3193 l2 = lpl2->lpl_loadavg; 3194 3195 if ((l1 + tolerance < l2) && (l1 < l2)) { 3196 /* lpl1 is significantly less loaded than lpl2 */ 3197 return (1); 3198 } 3199 3200 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3201 l1 + tolerance >= l2 && l1 < l2 && 3202 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3203 /* 3204 * lpl1's load is within the tolerance of lpl2. We're 3205 * willing to consider it be to better however if 3206 * it has been longer since we last homed a thread there 3207 */ 3208 return (1); 3209 } 3210 3211 return (0); 3212 } 3213 3214 /* 3215 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a 3216 * process that uses text replication changed home lgrp. This info is used by 3217 * segvn asyncronous thread to detect if it needs to recheck what lgrps 3218 * should be used for text replication. 3219 */ 3220 static uint64_t lgrp_trthr_moves = 0; 3221 3222 uint64_t 3223 lgrp_get_trthr_migrations(void) 3224 { 3225 return (lgrp_trthr_moves); 3226 } 3227 3228 void 3229 lgrp_update_trthr_migrations(uint64_t incr) 3230 { 3231 atomic_add_64(&lgrp_trthr_moves, incr); 3232 } 3233 3234 /* 3235 * An LWP is expected to be assigned to an lgroup for at least this long 3236 * for its anticipatory load to be justified. NOTE that this value should 3237 * not be set extremely huge (say, larger than 100 years), to avoid problems 3238 * with overflow in the calculation that uses it. 3239 */ 3240 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3241 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3242 3243 /* 3244 * Routine to change a thread's lgroup affiliation. This routine updates 3245 * the thread's kthread_t struct and its process' proc_t struct to note the 3246 * thread's new lgroup affiliation, and its lgroup affinities. 
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *	doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only update t_lpl; no load or p_lgrpset accounting.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/*
			 * Walk the process' circular thread list to decide
			 * whether the old lgroup's bit can be cleared.
			 */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.  The load is backed out of the old
		 * lpl and every ancestor lpl up to the root.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			lpl = oldlpl;
			for (;;) {
				/*
				 * Lock-free update of lpl_loadavg: retry the
				 * compare-and-swap until no other CPU has
				 * changed the value between read and write.
				 */
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (atomic_cas_32(
				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
				    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			/*
			 * membar_producer() orders the p_t1_lgrpid store
			 * before the p_tr_lgrpid read below.
			 */
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* Timestamp for the "anticipatory load aged?" check above */
		t->t_anttime = gethrtime();
	}
}

/*
 * Return lgroup memory allocation policy given advice from madvise(3C)
 */
lgrp_mem_policy_t
lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
{
	switch (advice) {
	case MADV_ACCESS_LWP:
		return (LGRP_MEM_POLICY_NEXT);
	case MADV_ACCESS_MANY:
		return (LGRP_MEM_POLICY_RANDOM);
	default:
		return (lgrp_mem_policy_default(size, type));
	}
}

/*
 * Figure out default policy for the given allocation size and mapping type
 * (MAP_SHARED/MAP_PRIVATE).
 */
lgrp_mem_policy_t
lgrp_mem_policy_default(size_t size, int type)
{
	cpupart_t		*cp;
	lgrp_mem_policy_t	policy;
	size_t			pset_mem_size;

	/*
	 * Randomly allocate memory across lgroups for shared memory
	 * beyond a certain threshold
	 */
	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
		/*
		 * Get total memory size of current thread's pset
		 */
		kpreempt_disable();
		cp = curthread->t_cpupart;
		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
		kpreempt_enable();

		/*
		 * Choose policy to randomly allocate memory across
		 * lgroups in pset if it will fit and is not default
		 * partition.  Otherwise, allocate memory randomly
		 * across machine.
		 */
		if (lgrp_mem_pset_aware && size < pset_mem_size)
			policy = LGRP_MEM_POLICY_RANDOM_PSET;
		else
			policy = LGRP_MEM_POLICY_RANDOM;
	} else
		/*
		 * Apply default policy for private memory and
		 * shared memory under the respective random
		 * threshold.
		 */
		policy = lgrp_mem_default_policy;

	return (policy);
}

/*
 * Get memory allocation policy for this segment; returns NULL when the
 * segment driver doesn't support policies.
 */
lgrp_mem_policy_info_t *
lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
{
	lgrp_mem_policy_info_t	*policy_info;
	extern struct seg_ops	segspt_ops;
	extern struct seg_ops	segspt_shmops;

	/*
	 * This is for binary compatibility to protect against third party
	 * segment drivers which haven't recompiled to allow for
	 * SEGOP_GETPOLICY()
	 */
	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
	    seg->s_ops != &segspt_shmops)
		return (NULL);

	policy_info = NULL;
	if (seg->s_ops->getpolicy != NULL)
		policy_info = SEGOP_GETPOLICY(seg, vaddr);

	return (policy_info);
}

/*
 * Set policy for allocating private memory given desired policy, policy info,
 * size in bytes of memory that policy is being applied.
 * Return 0 if policy wasn't set already and 1 if policy was set already
 */
int
lgrp_privm_policy_set(lgrp_mem_policy_t policy,
    lgrp_mem_policy_info_t *policy_info, size_t size)
{

	ASSERT(policy_info != NULL);

	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);

	/*
	 * Policy set already?
	 */
	if (policy == policy_info->mem_policy)
		return (1);

	/*
	 * Set policy
	 */
	policy_info->mem_policy = policy;
	policy_info->mem_lgrpid = LGRP_NONE;

	return (0);
}


/*
 * Get shared memory allocation policy with given tree and offset
 *
 * NOTE(review): assumes at least one of amp/vp is non-NULL; if both were
 * NULL, tree and shm_locality would be read uninitialized — confirm all
 * callers guarantee this.
 */
lgrp_mem_policy_info_t *
lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
    u_offset_t vn_off)
{
	u_offset_t		off;
	lgrp_mem_policy_info_t	*policy_info;
	lgrp_shm_policy_seg_t	*policy_seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	/*
	 * Get policy segment tree from anon_map or vnode and use specified
	 * anon index or vnode offset as offset
	 *
	 * Assume that no lock needs to be held on anon_map or vnode, since
	 * they should be protected by their reference count which must be
	 * nonzero for an existing segment
	 */
	if (amp) {
		ASSERT(amp->refcnt != 0);
		shm_locality = amp->locality;
		if (shm_locality == NULL)
			return (NULL);
		tree = shm_locality->loc_tree;
		off = ptob(anon_index);
	} else if (vp) {
		shm_locality = vp->v_locality;
		if (shm_locality == NULL)
			return (NULL);
		ASSERT(shm_locality->loc_count != 0);
		tree = shm_locality->loc_tree;
		off = vn_off;
	}

	if (tree == NULL)
		return (NULL);

	/*
	 * Lookup policy segment for offset into shared object and return
	 * policy info
	 */
	rw_enter(&shm_locality->loc_lock, RW_READER);
	policy_info = NULL;
	policy_seg = avl_find(tree, &off, &where);
	if (policy_seg)
		policy_info = &policy_seg->shm_policy;
	rw_exit(&shm_locality->loc_lock);

	return (policy_info);
}

/*
 * Default memory allocation policy for kernel segmap pages
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;

/*
 * Return
 * lgroup to use for allocating memory
 * given the segment and address
 *
 * There isn't any mutual exclusion that exists between calls
 * to this routine and DR, so this routine and whomever calls it
 * should be mindful of the possibility that the lgrp returned
 * may be deleted. If this happens, dereferences of the lgrp
 * pointer will still be safe, but the resources in the lgrp will
 * be gone, and LGRP_EXISTS() will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int			i;
	lgrp_t			*lgrp;
	klgrpset_t		lgrpset;
	int			lgrps_spanned;
	unsigned long		off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t		random;
	int			stat = 0;
	extern struct seg	*segkmap;

	/*
	 * Just return null if the lgrp framework hasn't finished
	 * initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			/*
			 * Kernel segments: only segmap has a special default;
			 * per-process random policies degrade to plain random.
			 */
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL) {
				policy = policy_info->mem_policy;
				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
					lgrp_id_t id = policy_info->mem_lgrpid;
					ASSERT(id != LGRP_NONE);
					ASSERT(id < NLGRPS_MAX);
					lgrp = lgrp_table[id];
					if (!LGRP_EXISTS(lgrp)) {
						/*
						 * The lgroup the segment was
						 * pinned to is gone (DR);
						 * fall back to next-touch.
						 */
						policy = LGRP_MEM_POLICY_NEXT;
					} else {
						lgrp_stat_add(id,
						    LGRP_NUM_NEXT_SEG, 1);
						return (lgrp);
					}
				}
			}
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 *
		 * NOTE(review): the LGRP_CPU_HAS_NO_LGRP(CPU) check is made
		 * before preemption is disabled — presumably benign since
		 * the worst case is allocating from a nearby lgroup; confirm.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		/*
		 * Advance "off" lgroups past the home, counting only
		 * lgroups that actually contain memory.
		 */
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}

/*
 * Return the number of pages in an lgroup
 *
 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
 * could cause tests that rely on the numat driver to fail....
3880 */ 3881 pgcnt_t 3882 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3883 { 3884 lgrp_t *lgrp; 3885 3886 lgrp = lgrp_table[lgrpid]; 3887 if (!LGRP_EXISTS(lgrp) || 3888 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3889 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3890 return (0); 3891 3892 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3893 } 3894 3895 /* 3896 * Initialize lgroup shared memory allocation policy support 3897 */ 3898 void 3899 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3900 { 3901 lgrp_shm_locality_t *shm_locality; 3902 3903 /* 3904 * Initialize locality field in anon_map 3905 * Don't need any locks because this is called when anon_map is 3906 * allocated, but not used anywhere yet. 3907 */ 3908 if (amp) { 3909 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3910 if (amp->locality == NULL) { 3911 /* 3912 * Allocate and initialize shared memory locality info 3913 * and set anon_map locality pointer to it 3914 * Drop lock across kmem_alloc(KM_SLEEP) 3915 */ 3916 ANON_LOCK_EXIT(&->a_rwlock); 3917 shm_locality = kmem_alloc(sizeof (*shm_locality), 3918 KM_SLEEP); 3919 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3920 NULL); 3921 shm_locality->loc_count = 1; /* not used for amp */ 3922 shm_locality->loc_tree = NULL; 3923 3924 /* 3925 * Reacquire lock and check to see whether anyone beat 3926 * us to initializing the locality info 3927 */ 3928 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3929 if (amp->locality != NULL) { 3930 rw_destroy(&shm_locality->loc_lock); 3931 kmem_free(shm_locality, 3932 sizeof (*shm_locality)); 3933 } else 3934 amp->locality = shm_locality; 3935 } 3936 ANON_LOCK_EXIT(&->a_rwlock); 3937 return; 3938 } 3939 3940 /* 3941 * Allocate shared vnode policy info if vnode is not locality aware yet 3942 */ 3943 mutex_enter(&vp->v_lock); 3944 if ((vp->v_flag & V_LOCALITY) == 0) { 3945 /* 3946 * Allocate and initialize shared memory locality info 3947 */ 3948 mutex_exit(&vp->v_lock); 3949 
shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3950 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3951 shm_locality->loc_count = 1; 3952 shm_locality->loc_tree = NULL; 3953 3954 /* 3955 * Point vnode locality field at shared vnode policy info 3956 * and set locality aware flag in vnode 3957 */ 3958 mutex_enter(&vp->v_lock); 3959 if ((vp->v_flag & V_LOCALITY) == 0) { 3960 vp->v_locality = shm_locality; 3961 vp->v_flag |= V_LOCALITY; 3962 } else { 3963 /* 3964 * Lost race so free locality info and increment count. 3965 */ 3966 rw_destroy(&shm_locality->loc_lock); 3967 kmem_free(shm_locality, sizeof (*shm_locality)); 3968 shm_locality = vp->v_locality; 3969 shm_locality->loc_count++; 3970 } 3971 mutex_exit(&vp->v_lock); 3972 3973 return; 3974 } 3975 3976 /* 3977 * Increment reference count of number of segments mapping this vnode 3978 * shared 3979 */ 3980 shm_locality = vp->v_locality; 3981 shm_locality->loc_count++; 3982 mutex_exit(&vp->v_lock); 3983 } 3984 3985 /* 3986 * Destroy the given shared memory policy segment tree 3987 */ 3988 void 3989 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3990 { 3991 lgrp_shm_policy_seg_t *cur; 3992 lgrp_shm_policy_seg_t *next; 3993 3994 if (tree == NULL) 3995 return; 3996 3997 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3998 while (cur != NULL) { 3999 next = AVL_NEXT(tree, cur); 4000 avl_remove(tree, cur); 4001 kmem_free(cur, sizeof (*cur)); 4002 cur = next; 4003 } 4004 kmem_free(tree, sizeof (avl_tree_t)); 4005 } 4006 4007 /* 4008 * Uninitialize lgroup shared memory allocation policy support 4009 */ 4010 void 4011 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 4012 { 4013 lgrp_shm_locality_t *shm_locality; 4014 4015 /* 4016 * For anon_map, deallocate shared memory policy tree and 4017 * zero locality field 4018 * Don't need any locks because anon_map is being freed 4019 */ 4020 if (amp) { 4021 if (amp->locality == NULL) 4022 return; 4023 shm_locality = amp->locality; 4024 
shm_locality->loc_count = 0; /* not really used for amp */ 4025 rw_destroy(&shm_locality->loc_lock); 4026 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4027 kmem_free(shm_locality, sizeof (*shm_locality)); 4028 amp->locality = 0; 4029 return; 4030 } 4031 4032 /* 4033 * For vnode, decrement reference count of segments mapping this vnode 4034 * shared and delete locality info if reference count drops to 0 4035 */ 4036 mutex_enter(&vp->v_lock); 4037 shm_locality = vp->v_locality; 4038 shm_locality->loc_count--; 4039 4040 if (shm_locality->loc_count == 0) { 4041 rw_destroy(&shm_locality->loc_lock); 4042 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4043 kmem_free(shm_locality, sizeof (*shm_locality)); 4044 vp->v_locality = 0; 4045 vp->v_flag &= ~V_LOCALITY; 4046 } 4047 mutex_exit(&vp->v_lock); 4048 } 4049 4050 /* 4051 * Compare two shared memory policy segments 4052 * Used by AVL tree code for searching 4053 */ 4054 int 4055 lgrp_shm_policy_compar(const void *x, const void *y) 4056 { 4057 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 4058 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 4059 4060 if (a->shm_off < b->shm_off) 4061 return (-1); 4062 if (a->shm_off >= b->shm_off + b->shm_size) 4063 return (1); 4064 return (0); 4065 } 4066 4067 /* 4068 * Concatenate seg1 with seg2 and remove seg2 4069 */ 4070 static int 4071 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4072 lgrp_shm_policy_seg_t *seg2) 4073 { 4074 if (!seg1 || !seg2 || 4075 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4076 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4077 return (-1); 4078 4079 seg1->shm_size += seg2->shm_size; 4080 avl_remove(tree, seg2); 4081 kmem_free(seg2, sizeof (*seg2)); 4082 return (0); 4083 } 4084 4085 /* 4086 * Split segment at given offset and return rightmost (uppermost) segment 4087 * Assumes that there are no overlapping segments 4088 */ 4089 static lgrp_shm_policy_seg_t * 4090 
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4091 u_offset_t off) 4092 { 4093 lgrp_shm_policy_seg_t *newseg; 4094 avl_index_t where; 4095 4096 ASSERT(seg != NULL); 4097 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4098 4099 if (!seg || off < seg->shm_off || off > seg->shm_off + 4100 seg->shm_size) 4101 return (NULL); 4102 4103 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4104 return (seg); 4105 4106 /* 4107 * Adjust size of left segment and allocate new (right) segment 4108 */ 4109 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4110 newseg->shm_policy = seg->shm_policy; 4111 newseg->shm_off = off; 4112 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4113 seg->shm_size = off - seg->shm_off; 4114 4115 /* 4116 * Find where to insert new segment in AVL tree and insert it 4117 */ 4118 (void) avl_find(tree, &off, &where); 4119 avl_insert(tree, newseg, where); 4120 4121 return (newseg); 4122 } 4123 4124 /* 4125 * Set shared memory allocation policy on specified shared object at given 4126 * offset and length 4127 * 4128 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4129 * -1 if can't set policy. 
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 *
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 * (lock is dropped across the sleeping allocation, so recheck
	 * for a racing initializer after reacquiring it)
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 *
	 * Each iteration handles the leading portion of [off, off + len)
	 * and advances off/len past it until the whole range is covered.
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now.  Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				/* Continue with the part past this segment */
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_lgrpid = LGRP_NONE;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_lgrpid = LGRP_NONE;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- the search scope is LGRP_SRCH_LOCAL, and all the memnodes in
 *	  "lgrp" have been returned.
 *	- the search scope is hierarchical, and all the memnodes in the
 *	  system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 * Already-tried memnodes are masked out at each level.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}