/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock --> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
 * saves some memory since the space allocated reflects the actual number of
 * lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */

cpupart_t		*cp_list_head;
cpupart_t		cp_default;
static cpupartid_t	cp_id_next;
uint_t			cp_numparts;
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t		cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
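	 * (cp_default's kstat, like any kstat created while pools are
	 * disabled, is created with ALL_ZONES visibility; other psets'
	 * kstats are visible only from the global zone.)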
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the cpupart's lgrp partitions (lpls).
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration)...hence the "oversizing"
		 * by 1.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}

/*
 * Teardown the cpupart's lgrp partitions.
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list,
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);


	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}


static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the processor
		 * set.  Note that no threads should be bound to this CPU since
		 * cpupart_move_thread() will refuse to move bound threads to
		 * other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If the forced flag is set, unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
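		 * Retry for a short while; weak bindings are expected to
		 * drain quickly now that cpu_inmotion has been published
		 * above.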
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp, t,
					    t->t_pri);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t,
				    t->t_pri);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
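 * Callers must hold cpu_lock, the thread's p_lock, and the thread lock
 * (see the ASSERTs below).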
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
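 * The pool lock and pidlock must be held as well (see the ASSERTs below).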
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t	*newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Move threads from the specified partition to cp_default.  If `unbind_all'
 * is set, move all threads; otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;
	int	err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Can only unbind threads which have a revocable
			 * binding, unless force unbinding is requested.
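			 * Soft (revocable) pset bindings are identified by
			 * TB_PSET_IS_SOFT().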
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t	*pp;
	int		err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t	*pp;
	uint_t		ncpus;
	cpu_t		*c;
	int		i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t	numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * PSET_NOESCAPE attribute for default cpu partition is always set
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}