1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2017 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/systm.h>
  28 #include <sys/cmn_err.h>
  29 #include <sys/cpuvar.h>
  30 #include <sys/thread.h>
  31 #include <sys/disp.h>
  32 #include <sys/kmem.h>
  33 #include <sys/debug.h>
  34 #include <sys/cpupart.h>
  35 #include <sys/pset.h>
  36 #include <sys/var.h>
  37 #include <sys/cyclic.h>
  38 #include <sys/lgrp.h>
  39 #include <sys/pghw.h>
  40 #include <sys/loadavg.h>
  41 #include <sys/class.h>
  42 #include <sys/fss.h>
  43 #include <sys/pool.h>
  44 #include <sys/pool_pset.h>
  45 #include <sys/policy.h>
  46 
  47 /*
  48  * Calling pool_lock() protects the pools configuration, which includes
  49  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
  50  * partitions from being created or destroyed while the lock is held.
  51  * The lock ordering with respect to related locks is:
  52  *
  53  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
  54  *
  55  * Blocking memory allocations may be made while holding "pool_lock"
  56  * or cpu_lock.
  57  */
  58 
/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized. This saves some memory since the space allocated reflects the
 * actual number of lgroups supported by the platform. The lgrp facility
 * provides a temporary space to hold lpl information during system bootstrap.
 */
  66 
/* Head of the circular list of partitions (linked through cp_next/cp_prev). */
cpupart_t               *cp_list_head;
/* The statically allocated default partition (ID CP_DEFAULT). */
cpupart_t               cp_default;
/* Next partition ID to try when a new partition is created. */
static cpupartid_t      cp_id_next;
/* Number of partitions that currently exist, including cp_default. */
uint_t                  cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
uint_t                  cp_numparts_nonempty;
  72 
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs, it is
 * set to (max_ncpus * 2 + 1); a non-zero value is left untouched.
 * cpupart_create() fails with ENOMEM once cp_numparts reaches this
 * limit.
 */
static uint_t           cp_max_numparts;
  82 
  83 /*
  84  * Processor sets and CPU partitions are different but related concepts.
  85  * A processor set is a user-level abstraction allowing users to create
  86  * sets of CPUs and bind threads exclusively to those sets.  A CPU
  87  * partition is a kernel dispatcher object consisting of a set of CPUs
  88  * and a global dispatch queue.  The processor set abstraction is
  89  * implemented via a CPU partition, and currently there is a 1-1
  90  * mapping between processor sets and partitions (excluding the default
  91  * partition, which is not visible as a processor set).  Hence, the
  92  * numbering for processor sets and CPU partitions is identical.  This
  93  * may not always be true in the future, and these macros could become
  94  * less trivial if we support e.g. a processor set containing multiple
  95  * CPU partitions.
  96  */
/* Processor set ID -> CPU partition ID; PS_NONE maps to CP_DEFAULT. */
#define PSTOCP(psid)    ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* CPU partition ID -> processor set ID; CP_DEFAULT maps to PS_NONE. */
#define CPTOPS(cpid)    ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/* Forward declaration; the function is used by cpupart_move_cpu() below. */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
 101 
 102 /*
 103  * Find a CPU partition given a processor set ID.
 104  */
 105 static cpupart_t *
 106 cpupart_find_all(psetid_t psid)
 107 {
 108         cpupart_t *cp;
 109         cpupartid_t cpid = PSTOCP(psid);
 110 
 111         ASSERT(MUTEX_HELD(&cpu_lock));
 112 
 113         /* default partition not visible as a processor set */
 114         if (psid == CP_DEFAULT)
 115                 return (NULL);
 116 
 117         if (psid == PS_MYID)
 118                 return (curthread->t_cpupart);
 119 
 120         cp = cp_list_head;
 121         do {
 122                 if (cp->cp_id == cpid)
 123                         return (cp);
 124                 cp = cp->cp_next;
 125         } while (cp != cp_list_head);
 126         return (NULL);
 127 }
 128 
 129 /*
 130  * Find a CPU partition given a processor set ID if the processor set
 131  * should be visible from the calling zone.
 132  */
 133 cpupart_t *
 134 cpupart_find(psetid_t psid)
 135 {
 136         cpupart_t *cp;
 137 
 138         ASSERT(MUTEX_HELD(&cpu_lock));
 139         cp = cpupart_find_all(psid);
 140         if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
 141             zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
 142                         return (NULL);
 143         return (cp);
 144 }
 145 
 146 static int
 147 cpupart_kstat_update(kstat_t *ksp, int rw)
 148 {
 149         cpupart_t *cp = (cpupart_t *)ksp->ks_private;
 150         cpupart_kstat_t *cpksp = ksp->ks_data;
 151 
 152         if (rw == KSTAT_WRITE)
 153                 return (EACCES);
 154 
 155         cpksp->cpk_updates.value.ui64 = cp->cp_updates;
 156         cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
 157         cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
 158         cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
 159         cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
 160             (16 - FSHIFT);
 161         cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
 162             (16 - FSHIFT);
 163         cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
 164             (16 - FSHIFT);
 165         return (0);
 166 }
 167 
 168 static void
 169 cpupart_kstat_create(cpupart_t *cp)
 170 {
 171         kstat_t *ksp;
 172         zoneid_t zoneid;
 173 
 174         ASSERT(MUTEX_HELD(&cpu_lock));
 175 
 176         /*
 177          * We have a bit of a chicken-egg problem since this code will
 178          * get called to create the kstats for CP_DEFAULT before the
 179          * pools framework gets initialized.  We circumvent the problem
 180          * by special-casing cp_default.
 181          */
 182         if (cp != &cp_default && pool_pset_enabled())
 183                 zoneid = GLOBAL_ZONEID;
 184         else
 185                 zoneid = ALL_ZONES;
 186         ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
 187             KSTAT_TYPE_NAMED,
 188             sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
 189         if (ksp != NULL) {
 190                 cpupart_kstat_t *cpksp = ksp->ks_data;
 191 
 192                 kstat_named_init(&cpksp->cpk_updates, "updates",
 193                     KSTAT_DATA_UINT64);
 194                 kstat_named_init(&cpksp->cpk_runnable, "runnable",
 195                     KSTAT_DATA_UINT64);
 196                 kstat_named_init(&cpksp->cpk_waiting, "waiting",
 197                     KSTAT_DATA_UINT64);
 198                 kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
 199                     KSTAT_DATA_UINT32);
 200                 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
 201                     KSTAT_DATA_UINT32);
 202                 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
 203                     KSTAT_DATA_UINT32);
 204                 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
 205                     KSTAT_DATA_UINT32);
 206 
 207                 ksp->ks_update = cpupart_kstat_update;
 208                 ksp->ks_private = cp;
 209 
 210                 kstat_install(ksp);
 211         }
 212         cp->cp_kstat = ksp;
 213 }
 214 
 215 /*
 216  * Initialize the cpupart's lgrp partions (lpls)
 217  */
 218 static void
 219 cpupart_lpl_initialize(cpupart_t *cp)
 220 {
 221         int i, sz;
 222 
 223         sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
 224         cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
 225 
 226         for (i = 0; i < sz; i++) {
 227                 /*
 228                  * The last entry of the lpl's resource set is always NULL
 229                  * by design (to facilitate iteration)...hence the "oversizing"
 230                  * by 1.
 231                  */
 232                 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
 233                 cp->cp_lgrploads[i].lpl_rset =
 234                     kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
 235                 cp->cp_lgrploads[i].lpl_id2rset =
 236                     kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
 237                 cp->cp_lgrploads[i].lpl_lgrpid = i;
 238         }
 239 }
 240 
 241 /*
 242  * Teardown the cpupart's lgrp partitions
 243  */
 244 static void
 245 cpupart_lpl_teardown(cpupart_t *cp)
 246 {
 247         int i, sz;
 248         lpl_t *lpl;
 249 
 250         for (i = 0; i < cp->cp_nlgrploads; i++) {
 251                 lpl = &cp->cp_lgrploads[i];
 252 
 253                 sz = lpl->lpl_rset_sz;
 254                 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
 255                 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
 256                 lpl->lpl_rset = NULL;
 257                 lpl->lpl_id2rset = NULL;
 258         }
 259         kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
 260         cp->cp_lgrploads = NULL;
 261 }
 262 
 263 /*
 264  * Initialize the default partition and kpreempt disp queue.
 265  */
 266 void
 267 cpupart_initialize_default(void)
 268 {
 269         lgrp_id_t i;
 270 
 271         cp_list_head = &cp_default;
 272         cp_default.cp_next = &cp_default;
 273         cp_default.cp_prev = &cp_default;
 274         cp_default.cp_id = CP_DEFAULT;
 275         cp_default.cp_kp_queue.disp_maxrunpri = -1;
 276         cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
 277         cp_default.cp_kp_queue.disp_cpu = NULL;
 278         cp_default.cp_gen = 0;
 279         cp_default.cp_loadavg.lg_cur = 0;
 280         cp_default.cp_loadavg.lg_len = 0;
 281         cp_default.cp_loadavg.lg_total = 0;
 282         for (i = 0; i < S_LOADAVG_SZ; i++) {
 283                 cp_default.cp_loadavg.lg_loads[i] = 0;
 284         }
 285         DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
 286         cp_id_next = CP_DEFAULT + 1;
 287         cpupart_kstat_create(&cp_default);
 288         cp_numparts = 1;
 289         if (cp_max_numparts == 0)       /* allow for /etc/system tuning */
 290                 cp_max_numparts = max_ncpus * 2 + 1;
 291         /*
 292          * Allocate space for cp_default list of lgrploads
 293          */
 294         cpupart_lpl_initialize(&cp_default);
 295 
 296         /*
 297          * The initial lpl topology is created in a special lpl list
 298          * lpl_bootstrap. It should be copied to cp_default.
 299          * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
 300          *       to the correct lpl in the cp_default.cp_lgrploads list.
 301          */
 302         lpl_topo_bootstrap(cp_default.cp_lgrploads,
 303             cp_default.cp_nlgrploads);
 304 
 305 
 306         cp_default.cp_attr = PSET_NOESCAPE;
 307         cp_numparts_nonempty = 1;
 308         /*
 309          * Set t0's home
 310          */
 311         t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
 312 
 313         bitset_init(&cp_default.cp_cmt_pgs);
 314         bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
 315 
 316         bitset_resize(&cp_default.cp_haltset, max_ncpus);
 317 }
 318 
 319 
 320 static int
 321 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
 322 {
 323         cpupart_t *oldpp;
 324         cpu_t   *ncp, *newlist;
 325         kthread_t *t;
 326         int     move_threads = 1;
 327         lgrp_id_t lgrpid;
 328         proc_t  *p;
 329         int lgrp_diff_lpl;
 330         lpl_t   *cpu_lpl;
 331         int     ret;
 332         boolean_t unbind_all_threads = (forced != 0);
 333 
 334         ASSERT(MUTEX_HELD(&cpu_lock));
 335         ASSERT(newpp != NULL);
 336 
 337         oldpp = cp->cpu_part;
 338         ASSERT(oldpp != NULL);
 339         ASSERT(oldpp->cp_ncpus > 0);
 340 
 341         if (newpp == oldpp) {
 342                 /*
 343                  * Don't need to do anything.
 344                  */
 345                 return (0);
 346         }
 347 
 348         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
 349 
 350         if (!disp_bound_partition(cp, 0)) {
 351                 /*
 352                  * Don't need to move threads if there are no threads in
 353                  * the partition.  Note that threads can't enter the
 354                  * partition while we're holding cpu_lock.
 355                  */
 356                 move_threads = 0;
 357         } else if (oldpp->cp_ncpus == 1) {
 358                 /*
 359                  * The last CPU is removed from a partition which has threads
 360                  * running in it. Some of these threads may be bound to this
 361                  * CPU.
 362                  *
 363                  * Attempt to unbind threads from the CPU and from the processor
 364                  * set. Note that no threads should be bound to this CPU since
 365                  * cpupart_move_threads will refuse to move bound threads to
 366                  * other CPUs.
 367                  */
 368                 (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
 369                 (void) cpupart_unbind_threads(oldpp, B_FALSE);
 370 
 371                 if (!disp_bound_partition(cp, 0)) {
 372                         /*
 373                          * No bound threads in this partition any more
 374                          */
 375                         move_threads = 0;
 376                 } else {
 377                         /*
 378                          * There are still threads bound to the partition
 379                          */
 380                         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 381                         return (EBUSY);
 382                 }
 383         }
 384 
 385         /*
 386          * If forced flag is set unbind any threads from this CPU.
 387          * Otherwise unbind soft-bound threads only.
 388          */
 389         if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
 390                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 391                 return (ret);
 392         }
 393 
 394         /*
 395          * Stop further threads weak binding to this cpu.
 396          */
 397         cpu_inmotion = cp;
 398         membar_enter();
 399 
 400         /*
 401          * Notify the Processor Groups subsystem that the CPU
 402          * will be moving cpu partitions. This is done before
 403          * CPUs are paused to provide an opportunity for any
 404          * needed memory allocations.
 405          */
 406         pg_cpupart_out(cp, oldpp);
 407         pg_cpupart_in(cp, newpp);
 408 
 409 again:
 410         if (move_threads) {
 411                 int loop_count;
 412                 /*
 413                  * Check for threads strong or weak bound to this CPU.
 414                  */
 415                 for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
 416                         if (loop_count >= 5) {
 417                                 cpu_state_change_notify(cp->cpu_id,
 418                                     CPU_CPUPART_IN);
 419                                 pg_cpupart_out(cp, newpp);
 420                                 pg_cpupart_in(cp, oldpp);
 421                                 cpu_inmotion = NULL;
 422                                 return (EBUSY); /* some threads still bound */
 423                         }
 424                         delay(1);
 425                 }
 426         }
 427 
 428         /*
 429          * Before we actually start changing data structures, notify
 430          * the cyclic subsystem that we want to move this CPU out of its
 431          * partition.
 432          */
 433         if (!cyclic_move_out(cp)) {
 434                 /*
 435                  * This CPU must be the last CPU in a processor set with
 436                  * a bound cyclic.
 437                  */
 438                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 439                 pg_cpupart_out(cp, newpp);
 440                 pg_cpupart_in(cp, oldpp);
 441                 cpu_inmotion = NULL;
 442                 return (EBUSY);
 443         }
 444 
 445         pause_cpus(cp, NULL);
 446 
 447         if (move_threads) {
 448                 /*
 449                  * The thread on cpu before the pause thread may have read
 450                  * cpu_inmotion before we raised the barrier above.  Check
 451                  * again.
 452                  */
 453                 if (disp_bound_threads(cp, 1)) {
 454                         start_cpus();
 455                         goto again;
 456                 }
 457 
 458         }
 459 
 460         /*
 461          * Now that CPUs are paused, let the PG subsystem perform
 462          * any necessary data structure updates.
 463          */
 464         pg_cpupart_move(cp, oldpp, newpp);
 465 
 466         /* save this cpu's lgroup -- it'll be the same in the new partition */
 467         lgrpid = cp->cpu_lpl->lpl_lgrpid;
 468 
 469         cpu_lpl = cp->cpu_lpl;
 470         /*
 471          * let the lgroup framework know cp has left the partition
 472          */
 473         lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
 474 
 475         /* move out of old partition */
 476         oldpp->cp_ncpus--;
 477         if (oldpp->cp_ncpus > 0) {
 478 
 479                 ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
 480                 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
 481                 if (oldpp->cp_cpulist == cp) {
 482                         oldpp->cp_cpulist = ncp;
 483                 }
 484         } else {
 485                 ncp = oldpp->cp_cpulist = NULL;
 486                 cp_numparts_nonempty--;
 487                 ASSERT(cp_numparts_nonempty != 0);
 488         }
 489         oldpp->cp_gen++;
 490 
 491         /* move into new partition */
 492         newlist = newpp->cp_cpulist;
 493         if (newlist == NULL) {
 494                 newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
 495                 cp_numparts_nonempty++;
 496                 ASSERT(cp_numparts_nonempty != 0);
 497         } else {
 498                 cp->cpu_next_part = newlist;
 499                 cp->cpu_prev_part = newlist->cpu_prev_part;
 500                 newlist->cpu_prev_part->cpu_next_part = cp;
 501                 newlist->cpu_prev_part = cp;
 502         }
 503         cp->cpu_part = newpp;
 504         newpp->cp_ncpus++;
 505         newpp->cp_gen++;
 506 
 507         ASSERT(bitset_is_null(&newpp->cp_haltset));
 508         ASSERT(bitset_is_null(&oldpp->cp_haltset));
 509 
 510         /*
 511          * let the lgroup framework know cp has entered the partition
 512          */
 513         lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
 514 
 515         /*
 516          * If necessary, move threads off processor.
 517          */
 518         if (move_threads) {
 519                 ASSERT(ncp != NULL);
 520 
 521                 /*
 522                  * Walk thru the active process list to look for
 523                  * threads that need to have a new home lgroup,
 524                  * or the last CPU they run on is the same CPU
 525                  * being moved out of the partition.
 526                  */
 527 
 528                 for (p = practive; p != NULL; p = p->p_next) {
 529 
 530                         t = p->p_tlist;
 531 
 532                         if (t == NULL)
 533                                 continue;
 534 
 535                         lgrp_diff_lpl = 0;
 536 
 537                         do {
 538 
 539                                 ASSERT(t->t_lpl != NULL);
 540 
 541                                 /*
 542                                  * Update the count of how many threads are
 543                                  * in this CPU's lgroup but have a different lpl
 544                                  */
 545 
 546                                 if (t->t_lpl != cpu_lpl &&
 547                                     t->t_lpl->lpl_lgrpid == lgrpid)
 548                                         lgrp_diff_lpl++;
 549                                 /*
 550                                  * If the lgroup that t is assigned to no
 551                                  * longer has any CPUs in t's partition,
 552                                  * we'll have to choose a new lgroup for t.
 553                                  */
 554 
 555                                 if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 556                                     t->t_cpupart)) {
 557                                         lgrp_move_thread(t,
 558                                             lgrp_choose(t, t->t_cpupart), 0);
 559                                 }
 560 
 561                                 /*
 562                                  * make sure lpl points to our own partition
 563                                  */
 564                                 ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
 565                                     (t->t_lpl < t->t_cpupart->cp_lgrploads +
 566                                     t->t_cpupart->cp_nlgrploads));
 567 
 568                                 ASSERT(t->t_lpl->lpl_ncpu > 0);
 569 
 570                                 /* Update CPU last ran on if it was this CPU */
 571                                 if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 572                                     t->t_bound_cpu != cp) {
 573                                         t->t_cpu = disp_lowpri_cpu(ncp,
 574                                             t->t_lpl, t->t_pri, NULL);
 575                                 }
 576                                 t = t->t_forw;
 577                         } while (t != p->p_tlist);
 578 
 579                         /*
 580                          * Didn't find any threads in the same lgroup as this
 581                          * CPU with a different lpl, so remove the lgroup from
 582                          * the process lgroup bitmask.
 583                          */
 584 
 585                         if (lgrp_diff_lpl)
 586                                 klgrpset_del(p->p_lgrpset, lgrpid);
 587                 }
 588 
 589                 /*
 590                  * Walk thread list looking for threads that need to be
 591                  * rehomed, since there are some threads that are not in
 592                  * their process's p_tlist.
 593                  */
 594 
 595                 t = curthread;
 596 
 597                 do {
 598                         ASSERT(t != NULL && t->t_lpl != NULL);
 599 
 600                         /*
 601                          * If the lgroup that t is assigned to no
 602                          * longer has any CPUs in t's partition,
 603                          * we'll have to choose a new lgroup for t.
 604                          * Also, choose best lgroup for home when
 605                          * thread has specified lgroup affinities,
 606                          * since there may be an lgroup with more
 607                          * affinity available after moving CPUs
 608                          * around.
 609                          */
 610                         if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 611                             t->t_cpupart) || t->t_lgrp_affinity) {
 612                                 lgrp_move_thread(t,
 613                                     lgrp_choose(t, t->t_cpupart), 1);
 614                         }
 615 
 616                         /* make sure lpl points to our own partition */
 617                         ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
 618                             (t->t_lpl < t->t_cpupart->cp_lgrploads +
 619                             t->t_cpupart->cp_nlgrploads));
 620 
 621                         ASSERT(t->t_lpl->lpl_ncpu > 0);
 622 
 623                         /* Update CPU last ran on if it was this CPU */
 624                         if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 625                             t->t_bound_cpu != cp) {
 626                                 t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
 627                                     t->t_pri, NULL);
 628                         }
 629 
 630                         t = t->t_next;
 631                 } while (t != curthread);
 632 
 633                 /*
 634                  * Clear off the CPU's run queue, and the kp queue if the
 635                  * partition is now empty.
 636                  */
 637                 disp_cpu_inactive(cp);
 638 
 639                 /*
 640                  * Make cp switch to a thread from the new partition.
 641                  */
 642                 cp->cpu_runrun = 1;
 643                 cp->cpu_kprunrun = 1;
 644         }
 645 
 646         cpu_inmotion = NULL;
 647         start_cpus();
 648 
 649         /*
 650          * Let anyone interested know that cpu has been added to the set.
 651          */
 652         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 653 
 654         /*
 655          * Now let the cyclic subsystem know that it can reshuffle cyclics
 656          * bound to the new processor set.
 657          */
 658         cyclic_move_in(cp);
 659 
 660         return (0);
 661 }
 662 
 663 /*
 664  * Check if thread can be moved to a new cpu partition.  Called by
 665  * cpupart_move_thread() and pset_bind_start().
 666  */
 667 int
 668 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
 669 {
 670         ASSERT(MUTEX_HELD(&cpu_lock));
 671         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 672         ASSERT(cp != NULL);
 673         ASSERT(THREAD_LOCK_HELD(tp));
 674 
 675         /*
 676          * CPU-bound threads can't be moved.
 677          */
 678         if (!ignore) {
 679                 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
 680                     tp->t_weakbound_cpu;
 681                 if (boundcpu != NULL && boundcpu->cpu_part != cp)
 682                         return (EBUSY);
 683         }
 684 
 685         if (tp->t_cid == sysdccid) {
 686                 return (EINVAL);        /* For now, sysdc threads can't move */
 687         }
 688 
 689         return (0);
 690 }
 691 
 692 /*
 693  * Move thread to new partition.  If ignore is non-zero, then CPU
 694  * bindings should be ignored (this is used when destroying a
 695  * partition).
 696  */
 697 static int
 698 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
 699     void *projbuf, void *zonebuf)
 700 {
 701         cpupart_t *oldpp = tp->t_cpupart;
 702         int ret;
 703 
 704         ASSERT(MUTEX_HELD(&cpu_lock));
 705         ASSERT(MUTEX_HELD(&pidlock));
 706         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 707         ASSERT(newpp != NULL);
 708 
 709         if (newpp->cp_cpulist == NULL)
 710                 return (EINVAL);
 711 
 712         /*
 713          * Check for errors first.
 714          */
 715         thread_lock(tp);
 716         if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
 717                 thread_unlock(tp);
 718                 return (ret);
 719         }
 720 
 721         /* move the thread */
 722         if (oldpp != newpp) {
 723                 /*
 724                  * Make the thread switch to the new partition.
 725                  */
 726                 tp->t_cpupart = newpp;
 727                 ASSERT(tp->t_lpl != NULL);
 728                 /*
 729                  * Leave the thread on the same lgroup if possible; otherwise
 730                  * choose a new lgroup for it.  In either case, update its
 731                  * t_lpl.
 732                  */
 733                 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
 734                     tp->t_lgrp_affinity == NULL) {
 735                         /*
 736                          * The thread's lgroup has CPUs in the thread's new
 737                          * partition, so the thread can stay assigned to the
 738                          * same lgroup.  Update its t_lpl to point to the
 739                          * lpl_t for its lgroup in its new partition.
 740                          */
 741                         lgrp_move_thread(tp, &tp->t_cpupart->\
 742                             cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
 743                 } else {
 744                         /*
 745                          * The thread's lgroup has no cpus in its new
 746                          * partition or it has specified lgroup affinities,
 747                          * so choose the best lgroup for the thread and
 748                          * assign it to that lgroup.
 749                          */
 750                         lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
 751                             1);
 752                 }
 753                 /*
 754                  * make sure lpl points to our own partition
 755                  */
 756                 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
 757                     (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
 758                     tp->t_cpupart->cp_nlgrploads));
 759 
 760                 ASSERT(tp->t_lpl->lpl_ncpu > 0);
 761 
 762                 if (tp->t_state == TS_ONPROC) {
 763                         cpu_surrender(tp);
 764                 } else if (tp->t_state == TS_RUN) {
 765                         (void) dispdeq(tp);
 766                         setbackdq(tp);
 767                 }
 768         }
 769 
 770         /*
 771          * Our binding has changed; set TP_CHANGEBIND.
 772          */
 773         tp->t_proc_flag |= TP_CHANGEBIND;
 774         aston(tp);
 775 
 776         thread_unlock(tp);
 777         fss_changepset(tp, newpp, projbuf, zonebuf);
 778 
 779         return (0);             /* success */
 780 }
 781 
 782 
 783 /*
 784  * This function binds a thread to a partition.  Must be called with the
 785  * p_lock of the containing process held (to keep the thread from going
 786  * away), and thus also with cpu_lock held (since cpu_lock must be
 787  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 788  * should be ignored (this is used when destroying a partition).
 789  */
 790 int
 791 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
 792     void *zonebuf)
 793 {
 794         cpupart_t       *newpp;
 795 
 796         ASSERT(pool_lock_held());
 797         ASSERT(MUTEX_HELD(&cpu_lock));
 798         ASSERT(MUTEX_HELD(&pidlock));
 799         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 800 
 801         if (psid == PS_NONE)
 802                 newpp = &cp_default;
 803         else {
 804                 newpp = cpupart_find(psid);
 805                 if (newpp == NULL) {
 806                         return (EINVAL);
 807                 }
 808         }
 809         return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
 810 }
 811 
 812 
 813 /*
 814  * Create a new partition.  On MP systems, this also allocates a
 815  * kpreempt disp queue for that partition.
 816  */
 817 int
 818 cpupart_create(psetid_t *psid)
 819 {
 820         cpupart_t       *pp;
 821 
 822         ASSERT(pool_lock_held());
 823 
 824         pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
 825 
 826         mutex_enter(&cpu_lock);
 827         if (cp_numparts == cp_max_numparts) {
 828                 mutex_exit(&cpu_lock);
 829                 kmem_free(pp, sizeof (cpupart_t));
 830                 return (ENOMEM);
 831         }
 832         cp_numparts++;
 833         /* find the next free partition ID */
 834         while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
 835                 cp_id_next++;
 836         pp->cp_id = cp_id_next++;
 837         pp->cp_ncpus = 0;
 838         pp->cp_cpulist = NULL;
 839         pp->cp_attr = 0;
 840         klgrpset_clear(pp->cp_lgrpset);
 841         pp->cp_kp_queue.disp_maxrunpri = -1;
 842         pp->cp_kp_queue.disp_max_unbound_pri = -1;
 843         pp->cp_kp_queue.disp_cpu = NULL;
 844         pp->cp_gen = 0;
 845         DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
 846         *psid = CPTOPS(pp->cp_id);
 847         disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
 848         cpupart_kstat_create(pp);
 849         cpupart_lpl_initialize(pp);
 850 
 851         bitset_init(&pp->cp_cmt_pgs);
 852 
 853         /*
 854          * Initialize and size the partition's bitset of halted CPUs.
 855          */
 856         bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
 857         bitset_resize(&pp->cp_haltset, max_ncpus);
 858 
 859         /*
 860          * Pause all CPUs while changing the partition list, to make sure
 861          * the clock thread (which traverses the list without holding
 862          * cpu_lock) isn't running.
 863          */
 864         pause_cpus(NULL, NULL);
 865         pp->cp_next = cp_list_head;
 866         pp->cp_prev = cp_list_head->cp_prev;
 867         cp_list_head->cp_prev->cp_next = pp;
 868         cp_list_head->cp_prev = pp;
 869         start_cpus();
 870         mutex_exit(&cpu_lock);
 871 
 872         return (0);
 873 }
 874 
 875 /*
 876  * Move threads from specified partition to cp_default. If `force' is specified,
 877  * move all threads, otherwise move only soft-bound threads.
 878  */
 879 static int
 880 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
 881 {
 882         void    *projbuf, *zonebuf;
 883         kthread_t *t;
 884         proc_t  *p;
 885         int     err = 0;
 886         psetid_t psid = pp->cp_id;
 887 
 888         ASSERT(pool_lock_held());
 889         ASSERT(MUTEX_HELD(&cpu_lock));
 890 
 891         if (pp == NULL || pp == &cp_default) {
 892                 return (EINVAL);
 893         }
 894 
 895         /*
 896          * Pre-allocate enough buffers for FSS for all active projects and
 897          * for all active zones on the system.  Unused buffers will be
 898          * freed later by fss_freebuf().
 899          */
 900         projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 901         zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 902 
 903         mutex_enter(&pidlock);
 904         t = curthread;
 905         do {
 906                 if (t->t_bind_pset == psid) {
 907 again:                  p = ttoproc(t);
 908                         mutex_enter(&p->p_lock);
 909                         if (ttoproc(t) != p) {
 910                                 /*
 911                                  * lwp_exit has changed this thread's process
 912                                  * pointer before we grabbed its p_lock.
 913                                  */
 914                                 mutex_exit(&p->p_lock);
 915                                 goto again;
 916                         }
 917 
 918                         /*
 919                          * Can only unbind threads which have revocable binding
 920                          * unless force unbinding requested.
 921                          */
 922                         if (unbind_all || TB_PSET_IS_SOFT(t)) {
 923                                 err = cpupart_bind_thread(t, PS_NONE, 1,
 924                                     projbuf, zonebuf);
 925                                 if (err) {
 926                                         mutex_exit(&p->p_lock);
 927                                         mutex_exit(&pidlock);
 928                                         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 929                                         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 930                                         return (err);
 931                                 }
 932                                 t->t_bind_pset = PS_NONE;
 933                         }
 934                         mutex_exit(&p->p_lock);
 935                 }
 936                 t = t->t_next;
 937         } while (t != curthread);
 938 
 939         mutex_exit(&pidlock);
 940         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 941         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 942         return (err);
 943 }
 944 
 945 /*
 946  * Destroy a partition.
 947  */
 948 int
 949 cpupart_destroy(psetid_t psid)
 950 {
 951         cpu_t   *cp, *first_cp;
 952         cpupart_t *pp, *newpp;
 953         int     err = 0;
 954 
 955         ASSERT(pool_lock_held());
 956         mutex_enter(&cpu_lock);
 957 
 958         pp = cpupart_find(psid);
 959         if (pp == NULL || pp == &cp_default) {
 960                 mutex_exit(&cpu_lock);
 961                 return (EINVAL);
 962         }
 963 
 964         /*
 965          * Unbind all the threads currently bound to the partition.
 966          */
 967         err = cpupart_unbind_threads(pp, B_TRUE);
 968         if (err) {
 969                 mutex_exit(&cpu_lock);
 970                 return (err);
 971         }
 972 
 973         newpp = &cp_default;
 974         while ((cp = pp->cp_cpulist) != NULL) {
 975                 if (err = cpupart_move_cpu(cp, newpp, 0)) {
 976                         mutex_exit(&cpu_lock);
 977                         return (err);
 978                 }
 979         }
 980 
 981         ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
 982         ASSERT(bitset_is_null(&pp->cp_haltset));
 983 
 984         /*
 985          * Teardown the partition's group of active CMT PGs and halted
 986          * CPUs now that they have all left.
 987          */
 988         bitset_fini(&pp->cp_cmt_pgs);
 989         bitset_fini(&pp->cp_haltset);
 990 
 991         /*
 992          * Reset the pointers in any offline processors so they won't
 993          * try to rejoin the destroyed partition when they're turned
 994          * online.
 995          */
 996         first_cp = cp = CPU;
 997         do {
 998                 if (cp->cpu_part == pp) {
 999                         ASSERT(cp->cpu_flags & CPU_OFFLINE);
1000                         cp->cpu_part = newpp;
1001                 }
1002                 cp = cp->cpu_next;
1003         } while (cp != first_cp);
1004 
1005         /*
1006          * Pause all CPUs while changing the partition list, to make sure
1007          * the clock thread (which traverses the list without holding
1008          * cpu_lock) isn't running.
1009          */
1010         pause_cpus(NULL, NULL);
1011         pp->cp_prev->cp_next = pp->cp_next;
1012         pp->cp_next->cp_prev = pp->cp_prev;
1013         if (cp_list_head == pp)
1014                 cp_list_head = pp->cp_next;
1015         start_cpus();
1016 
1017         if (cp_id_next > pp->cp_id)
1018                 cp_id_next = pp->cp_id;
1019 
1020         if (pp->cp_kstat)
1021                 kstat_delete(pp->cp_kstat);
1022 
1023         cp_numparts--;
1024 
1025         disp_kp_free(&pp->cp_kp_queue);
1026 
1027         cpupart_lpl_teardown(pp);
1028 
1029         kmem_free(pp, sizeof (cpupart_t));
1030         mutex_exit(&cpu_lock);
1031 
1032         return (err);
1033 }
1034 
1035 
1036 /*
1037  * Return the ID of the partition to which the specified processor belongs.
1038  */
1039 psetid_t
1040 cpupart_query_cpu(cpu_t *cp)
1041 {
1042         ASSERT(MUTEX_HELD(&cpu_lock));
1043 
1044         return (CPTOPS(cp->cpu_part->cp_id));
1045 }
1046 
1047 
1048 /*
1049  * Attach a processor to an existing partition.
1050  */
1051 int
1052 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1053 {
1054         cpupart_t       *pp;
1055         int             err;
1056 
1057         ASSERT(pool_lock_held());
1058         ASSERT(MUTEX_HELD(&cpu_lock));
1059 
1060         pp = cpupart_find(psid);
1061         if (pp == NULL)
1062                 return (EINVAL);
1063         if (cp->cpu_flags & CPU_OFFLINE)
1064                 return (EINVAL);
1065 
1066         err = cpupart_move_cpu(cp, pp, forced);
1067         return (err);
1068 }
1069 
1070 /*
1071  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1072  * this just checks for a valid partition.  If numcpus is non-NULL but
1073  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1074  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1075  * and a list of those cpus up to the size originally in *numcpus is
1076  * stored in cpulist[].  Also, store the processor set id in *psid.
1077  * This is useful in case the processor set id passed in was PS_MYID.
1078  */
1079 int
1080 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1081 {
1082         cpupart_t       *pp;
1083         uint_t          ncpus;
1084         cpu_t           *c;
1085         int             i;
1086 
1087         mutex_enter(&cpu_lock);
1088         pp = cpupart_find(*psid);
1089         if (pp == NULL) {
1090                 mutex_exit(&cpu_lock);
1091                 return (EINVAL);
1092         }
1093         *psid = CPTOPS(pp->cp_id);
1094         ncpus = pp->cp_ncpus;
1095         if (numcpus) {
1096                 if (ncpus > *numcpus) {
1097                         /*
1098                          * Only copy as many cpus as were passed in, but
1099                          * pass back the real number.
1100                          */
1101                         uint_t t = ncpus;
1102                         ncpus = *numcpus;
1103                         *numcpus = t;
1104                 } else
1105                         *numcpus = ncpus;
1106 
1107                 if (cpulist) {
1108                         c = pp->cp_cpulist;
1109                         for (i = 0; i < ncpus; i++) {
1110                                 ASSERT(c != NULL);
1111                                 cpulist[i] = c->cpu_id;
1112                                 c = c->cpu_next_part;
1113                         }
1114                 }
1115         }
1116         mutex_exit(&cpu_lock);
1117         return (0);
1118 }
1119 
1120 /*
1121  * Reallocate kpreempt queues for each CPU partition.  Called from
1122  * disp_setup when a new scheduling class is loaded that increases the
1123  * number of priorities in the system.
1124  */
1125 void
1126 cpupart_kpqalloc(pri_t npri)
1127 {
1128         cpupart_t *cpp;
1129 
1130         ASSERT(MUTEX_HELD(&cpu_lock));
1131         cpp = cp_list_head;
1132         do {
1133                 disp_kp_alloc(&cpp->cp_kp_queue, npri);
1134                 cpp = cpp->cp_next;
1135         } while (cpp != cp_list_head);
1136 }
1137 
1138 int
1139 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1140 {
1141         cpupart_t *cp;
1142         int i;
1143 
1144         ASSERT(nelem >= 0);
1145         ASSERT(nelem <= LOADAVG_NSTATS);
1146         ASSERT(MUTEX_HELD(&cpu_lock));
1147 
1148         cp = cpupart_find(psid);
1149         if (cp == NULL)
1150                 return (EINVAL);
1151         for (i = 0; i < nelem; i++)
1152                 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1153 
1154         return (0);
1155 }
1156 
1157 
1158 uint_t
1159 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1160 {
1161         uint_t numpart = 0;
1162         cpupart_t *cp;
1163 
1164         ASSERT(MUTEX_HELD(&cpu_lock));
1165         ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1166 
1167         if (list != NULL) {
1168                 cp = cp_list_head;
1169                 do {
1170                         if (((flag == CP_ALL) && (cp != &cp_default)) ||
1171                             ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1172                                 if (numpart == nelem)
1173                                         break;
1174                                 list[numpart++] = CPTOPS(cp->cp_id);
1175                         }
1176                         cp = cp->cp_next;
1177                 } while (cp != cp_list_head);
1178         }
1179 
1180         ASSERT(numpart < cp_numparts);
1181 
1182         if (flag == CP_ALL)
1183                 numpart = cp_numparts - 1; /* leave out default partition */
1184         else if (flag == CP_NONEMPTY)
1185                 numpart = cp_numparts_nonempty;
1186 
1187         return (numpart);
1188 }
1189 
1190 int
1191 cpupart_setattr(psetid_t psid, uint_t attr)
1192 {
1193         cpupart_t *cp;
1194 
1195         ASSERT(pool_lock_held());
1196 
1197         mutex_enter(&cpu_lock);
1198         if ((cp = cpupart_find(psid)) == NULL) {
1199                 mutex_exit(&cpu_lock);
1200                 return (EINVAL);
1201         }
1202         /*
1203          * PSET_NOESCAPE attribute for default cpu partition is always set
1204          */
1205         if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1206                 mutex_exit(&cpu_lock);
1207                 return (EINVAL);
1208         }
1209         cp->cp_attr = attr;
1210         mutex_exit(&cpu_lock);
1211         return (0);
1212 }
1213 
1214 int
1215 cpupart_getattr(psetid_t psid, uint_t *attrp)
1216 {
1217         cpupart_t *cp;
1218 
1219         mutex_enter(&cpu_lock);
1220         if ((cp = cpupart_find(psid)) == NULL) {
1221                 mutex_exit(&cpu_lock);
1222                 return (EINVAL);
1223         }
1224         *attrp = cp->cp_attr;
1225         mutex_exit(&cpu_lock);
1226         return (0);
1227 }