1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Copyright 2018 Joyent, Inc.
  25  * Copyright (c) 2017 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/systm.h>
  30 #include <sys/cmn_err.h>
  31 #include <sys/cpuvar.h>
  32 #include <sys/thread.h>
  33 #include <sys/disp.h>
  34 #include <sys/kmem.h>
  35 #include <sys/debug.h>
  36 #include <sys/cpupart.h>
  37 #include <sys/pset.h>
  38 #include <sys/var.h>
  39 #include <sys/cyclic.h>
  40 #include <sys/lgrp.h>
  41 #include <sys/pghw.h>
  42 #include <sys/loadavg.h>
  43 #include <sys/class.h>
  44 #include <sys/fss.h>
  45 #include <sys/pool.h>
  46 #include <sys/pool_pset.h>
  47 #include <sys/policy.h>
  48 
  49 /*
  50  * Calling pool_lock() protects the pools configuration, which includes
  51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
  52  * partitions from being created or destroyed while the lock is held.
  53  * The lock ordering with respect to related locks is:
  54  *
  55  *    pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
  56  *
  57  * Blocking memory allocations may be made while holding "pool_lock"
  58  * or cpu_lock.
  59  */
  60 
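     /*
      * As an illustrative sketch (not a required pattern), a caller needing
      * the full chain would nest the locks in that order:
      *
      *    pool_lock();
      *    mutex_enter(&cpu_lock);
      *    mutex_enter(&pidlock);
      *    mutex_enter(&p->p_lock);
      *    ...
      *    mutex_exit(&p->p_lock);
      *    mutex_exit(&pidlock);
      *    mutex_exit(&cpu_lock);
      *    pool_unlock();
      *
      * Most callers take only a prefix of this chain.
      */
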
  61 /*
  62  * The cp_default partition is allocated statically, but its lgroup load average
  63  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
  64  * saves some memory since the space allocated reflects the actual number of
  65  * lgroups supported by the platform. The lgrp facility provides a temporary
  66  * space to hold lpl information during system bootstrap.
  67  */
  68 
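     /*
      * cp_list_head anchors the circular, doubly-linked list of all CPU
      * partitions; cp_default is the statically allocated default partition.
      * cp_id_next is the next candidate partition ID, while cp_numparts and
      * cp_numparts_nonempty track the total number of partitions and the
      * number that currently contain CPUs.
      */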
  69 cpupart_t               *cp_list_head;
  70 cpupart_t               cp_default;
  71 static cpupartid_t      cp_id_next;
  72 uint_t                  cp_numparts;
  73 uint_t                  cp_numparts_nonempty;
  74 
  75 /*
  76  * Need to limit total number of partitions to avoid slowing down the
  77  * clock code too much.  The clock code traverses the list of
  78  * partitions and needs to be able to execute in a reasonable amount
  79  * of time (less than 1/hz seconds).  The maximum is sized based on
  80  * max_ncpus so it shouldn't be a problem unless there are large
  81  * numbers of empty partitions.
  82  */
  83 static uint_t           cp_max_numparts;
  84 
  85 /*
  86  * Processor sets and CPU partitions are different but related concepts.
  87  * A processor set is a user-level abstraction allowing users to create
  88  * sets of CPUs and bind threads exclusively to those sets.  A CPU
  89  * partition is a kernel dispatcher object consisting of a set of CPUs
  90  * and a global dispatch queue.  The processor set abstraction is
  91  * implemented via a CPU partition, and currently there is a 1-1
  92  * mapping between processor sets and partitions (excluding the default
  93  * partition, which is not visible as a processor set).  Hence, the
  94  * numbering for processor sets and CPU partitions is identical.  This
  95  * may not always be true in the future, and these macros could become
  96  * less trivial if we support e.g. a processor set containing multiple
  97  * CPU partitions.
  98  */
  99 #define PSTOCP(psid)    ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
 100 #define CPTOPS(cpid)    ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
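     /*
      * For example, PSTOCP(PS_NONE) yields CP_DEFAULT and CPTOPS(CP_DEFAULT)
      * yields PS_NONE; every other ID maps to itself.
      */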
 101 
 102 static int cpupart_unbind_threads(cpupart_t *, boolean_t);
 103 
 104 /*
 105  * Find a CPU partition given a processor set ID.
 106  */
 107 static cpupart_t *
 108 cpupart_find_all(psetid_t psid)
 109 {
 110         cpupart_t *cp;
 111         cpupartid_t cpid = PSTOCP(psid);
 112 
 113         ASSERT(MUTEX_HELD(&cpu_lock));
 114 
 115         /* default partition not visible as a processor set */
 116         if (psid == CP_DEFAULT)
 117                 return (NULL);
 118 
 119         if (psid == PS_MYID)
 120                 return (curthread->t_cpupart);
 121 
 122         cp = cp_list_head;
 123         do {
 124                 if (cp->cp_id == cpid)
 125                         return (cp);
 126                 cp = cp->cp_next;
 127         } while (cp != cp_list_head);
 128         return (NULL);
 129 }
 130 
 131 /*
 132  * Find a CPU partition given a processor set ID if the processor set
 133  * should be visible from the calling zone.
 134  */
 135 cpupart_t *
 136 cpupart_find(psetid_t psid)
 137 {
 138         cpupart_t *cp;
 139 
 140         ASSERT(MUTEX_HELD(&cpu_lock));
 141         cp = cpupart_find_all(psid);
 142         if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
 143             zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
 144                 return (NULL);
 145         return (cp);
 146 }
 147 
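     /*
      * kstat update routine for a partition's "pset" kstat.  Snapshots the
      * partition's cumulative runnable/waiting counts, its CPU count, and its
      * decaying load averages, converting the latter from the internal 16-bit
      * fractional fixed-point format to the FSHIFT-based format exported to
      * consumers.  The kstat is read-only.
      */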
 148 static int
 149 cpupart_kstat_update(kstat_t *ksp, int rw)
 150 {
 151         cpupart_t *cp = (cpupart_t *)ksp->ks_private;
 152         cpupart_kstat_t *cpksp = ksp->ks_data;
 153 
 154         if (rw == KSTAT_WRITE)
 155                 return (EACCES);
 156 
 157         cpksp->cpk_updates.value.ui64 = cp->cp_updates;
 158         cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
 159         cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
 160         cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
 161         cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
 162             (16 - FSHIFT);
 163         cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
 164             (16 - FSHIFT);
 165         cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
 166             (16 - FSHIFT);
 167         return (0);
 168 }
 169 
 170 static void
 171 cpupart_kstat_create(cpupart_t *cp)
 172 {
 173         kstat_t *ksp;
 174         zoneid_t zoneid;
 175 
 176         ASSERT(MUTEX_HELD(&cpu_lock));
 177 
 178         /*
 179          * We have a bit of a chicken-egg problem since this code will
 180          * get called to create the kstats for CP_DEFAULT before the
 181          * pools framework gets initialized.  We circumvent the problem
 182          * by special-casing cp_default.
 183          */
 184         if (cp != &cp_default && pool_pset_enabled())
 185                 zoneid = GLOBAL_ZONEID;
 186         else
 187                 zoneid = ALL_ZONES;
 188         ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
 189             KSTAT_TYPE_NAMED,
 190             sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
 191         if (ksp != NULL) {
 192                 cpupart_kstat_t *cpksp = ksp->ks_data;
 193 
 194                 kstat_named_init(&cpksp->cpk_updates, "updates",
 195                     KSTAT_DATA_UINT64);
 196                 kstat_named_init(&cpksp->cpk_runnable, "runnable",
 197                     KSTAT_DATA_UINT64);
 198                 kstat_named_init(&cpksp->cpk_waiting, "waiting",
 199                     KSTAT_DATA_UINT64);
 200                 kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
 201                     KSTAT_DATA_UINT32);
 202                 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
 203                     KSTAT_DATA_UINT32);
 204                 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
 205                     KSTAT_DATA_UINT32);
 206                 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
 207                     KSTAT_DATA_UINT32);
 208 
 209                 ksp->ks_update = cpupart_kstat_update;
 210                 ksp->ks_private = cp;
 211 
 212                 kstat_install(ksp);
 213         }
 214         cp->cp_kstat = ksp;
 215 }
 216 
 217 /*
 218  * Initialize the cpupart's lgrp partitions (lpls)
 219  */
 220 static void
 221 cpupart_lpl_initialize(cpupart_t *cp)
 222 {
 223         int i, sz;
 224 
 225         sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
 226         cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
 227 
 228         for (i = 0; i < sz; i++) {
 229                 /*
 230                  * The last entry of the lpl's resource set is always NULL
 231                  * by design (to facilitate iteration)...hence the "oversizing"
 232                  * by 1.
 233                  */
 234                 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
 235                 cp->cp_lgrploads[i].lpl_rset =
 236                     kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
 237                 cp->cp_lgrploads[i].lpl_id2rset =
 238                     kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
 239                 cp->cp_lgrploads[i].lpl_lgrpid = i;
 240         }
 241 }
 242 
 243 /*
 244  * Tear down the cpupart's lgrp partitions
 245  */
 246 static void
 247 cpupart_lpl_teardown(cpupart_t *cp)
 248 {
 249         int i, sz;
 250         lpl_t *lpl;
 251 
 252         for (i = 0; i < cp->cp_nlgrploads; i++) {
 253                 lpl = &cp->cp_lgrploads[i];
 254 
 255                 sz = lpl->lpl_rset_sz;
 256                 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
 257                 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
 258                 lpl->lpl_rset = NULL;
 259                 lpl->lpl_id2rset = NULL;
 260         }
 261         kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
 262         cp->cp_lgrploads = NULL;
 263 }
 264 
 265 /*
 266  * Initialize the default partition and kpreempt disp queue.
 267  */
 268 void
 269 cpupart_initialize_default(void)
 270 {
 271         lgrp_id_t i;
 272 
 273         cp_list_head = &cp_default;
 274         cp_default.cp_next = &cp_default;
 275         cp_default.cp_prev = &cp_default;
 276         cp_default.cp_id = CP_DEFAULT;
 277         cp_default.cp_kp_queue.disp_maxrunpri = -1;
 278         cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
 279         cp_default.cp_kp_queue.disp_cpu = NULL;
 280         cp_default.cp_gen = 0;
 281         cp_default.cp_loadavg.lg_cur = 0;
 282         cp_default.cp_loadavg.lg_len = 0;
 283         cp_default.cp_loadavg.lg_total = 0;
 284         for (i = 0; i < S_LOADAVG_SZ; i++) {
 285                 cp_default.cp_loadavg.lg_loads[i] = 0;
 286         }
 287         DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
 288         cp_id_next = CP_DEFAULT + 1;
 289         cpupart_kstat_create(&cp_default);
 290         cp_numparts = 1;
 291         if (cp_max_numparts == 0)       /* allow for /etc/system tuning */
 292                 cp_max_numparts = max_ncpus * 2 + 1;
 293         /*
 294          * Allocate space for cp_default's list of lgrploads
 295          */
 296         cpupart_lpl_initialize(&cp_default);
 297 
 298         /*
 299          * The initial lpl topology is created in a special lpl list
 300          * lpl_bootstrap. It should be copied to cp_default.
 301          * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
 302          *       to the correct lpl in the cp_default.cp_lgrploads list.
 303          */
 304         lpl_topo_bootstrap(cp_default.cp_lgrploads,
 305             cp_default.cp_nlgrploads);
 306 
 307 
 308         cp_default.cp_attr = PSET_NOESCAPE;
 309         cp_numparts_nonempty = 1;
 310         /*
 311          * Set t0's home lgroup (lpl)
 312          */
 313         t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
 314 
 315         bitset_init(&cp_default.cp_cmt_pgs);
 316         bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
 317 
 318         bitset_resize(&cp_default.cp_haltset, max_ncpus);
 319 }
 320 
 321 
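     /*
      * Move CPU cp from its current partition into newpp.  Bound threads are
      * unbound first (all of them if `forced' is set, otherwise only those
      * with revocable bindings); the move fails with EBUSY if strongly bound
      * threads or bound cyclics remain.  CPUs are paused while the partition
      * CPU lists and lgroup topology are updated, and threads whose home
      * lgroup or last-run CPU is invalidated by the move are rehomed.
      */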
 322 static int
 323 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
 324 {
 325         cpupart_t *oldpp;
 326         cpu_t   *ncp, *newlist;
 327         kthread_t *t;
 328         int     move_threads = 1;
 329         lgrp_id_t lgrpid;
 330         proc_t  *p;
 331         int lgrp_diff_lpl;
 332         lpl_t   *cpu_lpl;
 333         int     ret;
 334         boolean_t unbind_all_threads = (forced != 0);
 335 
 336         ASSERT(MUTEX_HELD(&cpu_lock));
 337         ASSERT(newpp != NULL);
 338 
 339         oldpp = cp->cpu_part;
 340         ASSERT(oldpp != NULL);
 341         ASSERT(oldpp->cp_ncpus > 0);
 342 
 343         if (newpp == oldpp) {
 344                 /*
 345                  * Don't need to do anything.
 346                  */
 347                 return (0);
 348         }
 349 
 350         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
 351 
 352         if (!disp_bound_partition(cp, 0)) {
 353                 /*
 354                  * Don't need to move threads if there are no threads in
 355                  * the partition.  Note that threads can't enter the
 356                  * partition while we're holding cpu_lock.
 357                  */
 358                 move_threads = 0;
 359         } else if (oldpp->cp_ncpus == 1) {
 360                 /*
 361                  * The last CPU is removed from a partition which has threads
 362                  * running in it. Some of these threads may be bound to this
 363                  * CPU.
 364                  *
 365                  * Attempt to unbind threads from the CPU and from the processor
 366                  * set. Note that no threads should be bound to this CPU since
 367                  * cpupart_move_threads will refuse to move bound threads to
 368                  * other CPUs.
 369                  */
 370                 (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
 371                 (void) cpupart_unbind_threads(oldpp, B_FALSE);
 372 
 373                 if (!disp_bound_partition(cp, 0)) {
 374                         /*
 375                          * No bound threads in this partition any more
 376                          */
 377                         move_threads = 0;
 378                 } else {
 379                         /*
 380                          * There are still threads bound to the partition
 381                          */
 382                         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 383                         return (EBUSY);
 384                 }
 385         }
 386 
 387         /*
 388          * If the forced flag is set, unbind any threads from this CPU.
 389          * Otherwise unbind only soft-bound threads.
 390          */
 391         if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
 392                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 393                 return (ret);
 394         }
 395 
 396         /*
 397          * Stop further threads from weak-binding to this CPU.
 398          */
 399         cpu_inmotion = cp;
 400         membar_enter();
 401 
 402         /*
 403          * Notify the Processor Groups subsystem that the CPU
 404          * will be moving cpu partitions. This is done before
 405          * CPUs are paused to provide an opportunity for any
 406          * needed memory allocations.
 407          */
 408         pg_cpupart_out(cp, oldpp);
 409         pg_cpupart_in(cp, newpp);
 410 
 411 again:
 412         if (move_threads) {
 413                 int loop_count;
 414                 /*
 415                  * Check for threads strong or weak bound to this CPU.
 416                  */
 417                 for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
 418                         if (loop_count >= 5) {
 419                                 cpu_state_change_notify(cp->cpu_id,
 420                                     CPU_CPUPART_IN);
 421                                 pg_cpupart_out(cp, newpp);
 422                                 pg_cpupart_in(cp, oldpp);
 423                                 cpu_inmotion = NULL;
 424                                 return (EBUSY); /* some threads still bound */
 425                         }
 426                         delay(1);
 427                 }
 428         }
 429 
 430         /*
 431          * Before we actually start changing data structures, notify
 432          * the cyclic subsystem that we want to move this CPU out of its
 433          * partition.
 434          */
 435         if (!cyclic_move_out(cp)) {
 436                 /*
 437                  * This CPU must be the last CPU in a processor set with
 438                  * a bound cyclic.
 439                  */
 440                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 441                 pg_cpupart_out(cp, newpp);
 442                 pg_cpupart_in(cp, oldpp);
 443                 cpu_inmotion = NULL;
 444                 return (EBUSY);
 445         }
 446 
 447         pause_cpus(cp, NULL);
 448 
 449         if (move_threads) {
 450                 /*
 451                  * The thread on cpu before the pause thread may have read
 452                  * cpu_inmotion before we raised the barrier above.  Check
 453                  * again.
 454                  */
 455                 if (disp_bound_threads(cp, 1)) {
 456                         start_cpus();
 457                         goto again;
 458                 }
 459 
 460         }
 461 
 462         /*
 463          * Now that CPUs are paused, let the PG subsystem perform
 464          * any necessary data structure updates.
 465          */
 466         pg_cpupart_move(cp, oldpp, newpp);
 467 
 468         /* save this cpu's lgroup -- it'll be the same in the new partition */
 469         lgrpid = cp->cpu_lpl->lpl_lgrpid;
 470 
 471         cpu_lpl = cp->cpu_lpl;
 472         /*
 473          * let the lgroup framework know cp has left the partition
 474          */
 475         lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
 476 
 477         /* move out of old partition */
 478         oldpp->cp_ncpus--;
 479         if (oldpp->cp_ncpus > 0) {
 480 
 481                 ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
 482                 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
 483                 if (oldpp->cp_cpulist == cp) {
 484                         oldpp->cp_cpulist = ncp;
 485                 }
 486         } else {
 487                 ncp = oldpp->cp_cpulist = NULL;
 488                 cp_numparts_nonempty--;
 489                 ASSERT(cp_numparts_nonempty != 0);
 490         }
 491         oldpp->cp_gen++;
 492 
 493         /* move into new partition */
 494         newlist = newpp->cp_cpulist;
 495         if (newlist == NULL) {
 496                 newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
 497                 cp_numparts_nonempty++;
 498                 ASSERT(cp_numparts_nonempty != 0);
 499         } else {
 500                 cp->cpu_next_part = newlist;
 501                 cp->cpu_prev_part = newlist->cpu_prev_part;
 502                 newlist->cpu_prev_part->cpu_next_part = cp;
 503                 newlist->cpu_prev_part = cp;
 504         }
 505         cp->cpu_part = newpp;
 506         newpp->cp_ncpus++;
 507         newpp->cp_gen++;
 508 
 509         ASSERT(bitset_is_null(&newpp->cp_haltset));
 510         ASSERT(bitset_is_null(&oldpp->cp_haltset));
 511 
 512         /*
 513          * let the lgroup framework know cp has entered the partition
 514          */
 515         lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
 516 
 517         /*
 518          * If necessary, move threads off processor.
 519          */
 520         if (move_threads) {
 521                 ASSERT(ncp != NULL);
 522 
 523                 /*
 524                  * Walk through the active process list looking for
 525                  * threads that need a new home lgroup, or whose
 526                  * most recently used CPU is the one being moved
 527                  * out of the partition.
 528                  */
 529 
 530                 for (p = practive; p != NULL; p = p->p_next) {
 531 
 532                         t = p->p_tlist;
 533 
 534                         if (t == NULL)
 535                                 continue;
 536 
 537                         lgrp_diff_lpl = 0;
 538 
 539                         do {
 540 
 541                                 ASSERT(t->t_lpl != NULL);
 542 
 543                                 /*
 544                                  * Update the count of how many threads are
 545                                  * in this CPU's lgroup but have a different lpl
 546                                  */
 547 
 548                                 if (t->t_lpl != cpu_lpl &&
 549                                     t->t_lpl->lpl_lgrpid == lgrpid)
 550                                         lgrp_diff_lpl++;
 551                                 /*
 552                                  * If the lgroup that t is assigned to no
 553                                  * longer has any CPUs in t's partition,
 554                                  * we'll have to choose a new lgroup for t.
 555                                  */
 556 
 557                                 if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 558                                     t->t_cpupart)) {
 559                                         lgrp_move_thread(t,
 560                                             lgrp_choose(t, t->t_cpupart), 0);
 561                                 }
 562 
 563                                 /*
 564                                  * make sure lpl points to our own partition
 565                                  */
 566                                 ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
 567                                     (t->t_lpl < t->t_cpupart->cp_lgrploads +
 568                                     t->t_cpupart->cp_nlgrploads));
 569 
 570                                 ASSERT(t->t_lpl->lpl_ncpu > 0);
 571 
 572                                 /* Update CPU last ran on if it was this CPU */
 573                                 if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 574                                     t->t_bound_cpu != cp) {
 575                                         t->t_cpu = disp_lowpri_cpu(ncp, t,
 576                                             t->t_pri);
 577                                 }
 578                                 t = t->t_forw;
 579                         } while (t != p->p_tlist);
 580 
 581                         /*
 582                          * Didn't find any threads in the same lgroup as this
 583                          * CPU with a different lpl, so remove the lgroup from
 584                          * the process lgroup bitmask.
 585                          */
 586 
 587                         if (lgrp_diff_lpl == 0)
 588                                 klgrpset_del(p->p_lgrpset, lgrpid);
 589                 }
 590 
 591                 /*
 592                  * Walk thread list looking for threads that need to be
 593                  * rehomed, since there are some threads that are not in
 594                  * their process's p_tlist.
 595                  */
 596 
 597                 t = curthread;
 598 
 599                 do {
 600                         ASSERT(t != NULL && t->t_lpl != NULL);
 601 
 602                         /*
 603                          * If the lgroup that t is assigned to no
 604                          * longer has any CPUs in t's partition,
 605                          * we'll have to choose a new lgroup for t.
 606                          * Also, choose best lgroup for home when
 607                          * thread has specified lgroup affinities,
 608                          * since there may be an lgroup with more
 609                          * affinity available after moving CPUs
 610                          * around.
 611                          */
 612                         if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 613                             t->t_cpupart) || t->t_lgrp_affinity) {
 614                                 lgrp_move_thread(t,
 615                                     lgrp_choose(t, t->t_cpupart), 1);
 616                         }
 617 
 618                         /* make sure lpl points to our own partition */
 619                         ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
 620                             (t->t_lpl < t->t_cpupart->cp_lgrploads +
 621                             t->t_cpupart->cp_nlgrploads));
 622 
 623                         ASSERT(t->t_lpl->lpl_ncpu > 0);
 624 
 625                         /* Update CPU last ran on if it was this CPU */
 626                         if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 627                             t->t_bound_cpu != cp) {
 628                                 t->t_cpu = disp_lowpri_cpu(ncp, t,
 629                                     t->t_pri);
 630                         }
 631 
 632                         t = t->t_next;
 633                 } while (t != curthread);
 634 
 635                 /*
 636                  * Clear off the CPU's run queue, and the kp queue if the
 637                  * partition is now empty.
 638                  */
 639                 disp_cpu_inactive(cp);
 640 
 641                 /*
 642                  * Make cp switch to a thread from the new partition.
 643                  */
 644                 cp->cpu_runrun = 1;
 645                 cp->cpu_kprunrun = 1;
 646         }
 647 
 648         cpu_inmotion = NULL;
 649         start_cpus();
 650 
 651         /*
 652          * Let anyone interested know that cpu has been added to the set.
 653          */
 654         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 655 
 656         /*
 657          * Now let the cyclic subsystem know that it can reshuffle cyclics
 658          * bound to the new processor set.
 659          */
 660         cyclic_move_in(cp);
 661 
 662         return (0);
 663 }
 664 
 665 /*
 666  * Check if thread can be moved to a new cpu partition.  Called by
 667  * cpupart_move_thread() and pset_bind_start().
 668  */
 669 int
 670 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
 671 {
 672         ASSERT(MUTEX_HELD(&cpu_lock));
 673         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 674         ASSERT(cp != NULL);
 675         ASSERT(THREAD_LOCK_HELD(tp));
 676 
 677         /*
 678          * CPU-bound threads can't be moved.
 679          */
 680         if (!ignore) {
 681                 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
 682                     tp->t_weakbound_cpu;
 683                 if (boundcpu != NULL && boundcpu->cpu_part != cp)
 684                         return (EBUSY);
 685         }
 686 
 687         if (tp->t_cid == sysdccid) {
 688                 return (EINVAL);        /* For now, sysdc threads can't move */
 689         }
 690 
 691         return (0);
 692 }
 693 
 694 /*
 695  * Move thread to new partition.  If ignore is non-zero, then CPU
 696  * bindings should be ignored (this is used when destroying a
 697  * partition).
 698  */
 699 static int
 700 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
 701     void *projbuf, void *zonebuf)
 702 {
 703         cpupart_t *oldpp = tp->t_cpupart;
 704         int ret;
 705 
 706         ASSERT(MUTEX_HELD(&cpu_lock));
 707         ASSERT(MUTEX_HELD(&pidlock));
 708         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 709         ASSERT(newpp != NULL);
 710 
 711         if (newpp->cp_cpulist == NULL)
 712                 return (EINVAL);
 713 
 714         /*
 715          * Check for errors first.
 716          */
 717         thread_lock(tp);
 718         if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
 719                 thread_unlock(tp);
 720                 return (ret);
 721         }
 722 
 723         /* move the thread */
 724         if (oldpp != newpp) {
 725                 /*
 726                  * Make the thread switch to the new partition.
 727                  */
 728                 tp->t_cpupart = newpp;
 729                 ASSERT(tp->t_lpl != NULL);
 730                 /*
 731                  * Leave the thread on the same lgroup if possible; otherwise
 732                  * choose a new lgroup for it.  In either case, update its
 733                  * t_lpl.
 734                  */
 735                 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
 736                     tp->t_lgrp_affinity == NULL) {
 737                         /*
 738                          * The thread's lgroup has CPUs in the thread's new
 739                          * partition, so the thread can stay assigned to the
 740                          * same lgroup.  Update its t_lpl to point to the
 741                          * lpl_t for its lgroup in its new partition.
 742                          */
 743                         lgrp_move_thread(tp, &tp->t_cpupart->
 744                             cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
 745                 } else {
 746                         /*
 747                          * The thread's lgroup has no cpus in its new
 748                          * partition or it has specified lgroup affinities,
 749                          * so choose the best lgroup for the thread and
 750                          * assign it to that lgroup.
 751                          */
 752                         lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
 753                             1);
 754                 }
 755                 /*
 756                  * make sure lpl points to our own partition
 757                  */
 758                 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
 759                     (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
 760                     tp->t_cpupart->cp_nlgrploads));
 761 
 762                 ASSERT(tp->t_lpl->lpl_ncpu > 0);
 763 
 764                 if (tp->t_state == TS_ONPROC) {
 765                         cpu_surrender(tp);
 766                 } else if (tp->t_state == TS_RUN) {
 767                         (void) dispdeq(tp);
 768                         setbackdq(tp);
 769                 }
 770         }
 771 
 772         /*
 773          * Our binding has changed; set TP_CHANGEBIND.
 774          */
 775         tp->t_proc_flag |= TP_CHANGEBIND;
 776         aston(tp);
 777 
 778         thread_unlock(tp);
 779         fss_changepset(tp, newpp, projbuf, zonebuf);
 780 
 781         return (0);             /* success */
 782 }
 783 
 784 
 785 /*
 786  * This function binds a thread to a partition.  Must be called with the
 787  * p_lock of the containing process held (to keep the thread from going
 788  * away), and thus also with cpu_lock held (since cpu_lock must be
 789  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 790  * should be ignored (this is used when destroying a partition).
 791  */
 792 int
 793 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
 794     void *zonebuf)
 795 {
 796         cpupart_t       *newpp;
 797 
 798         ASSERT(pool_lock_held());
 799         ASSERT(MUTEX_HELD(&cpu_lock));
 800         ASSERT(MUTEX_HELD(&pidlock));
 801         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 802 
 803         if (psid == PS_NONE)
 804                 newpp = &cp_default;
 805         else {
 806                 newpp = cpupart_find(psid);
 807                 if (newpp == NULL) {
 808                         return (EINVAL);
 809                 }
 810         }
 811         return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
 812 }
 813 
 814 
 815 /*
 816  * Create a new partition.  On MP systems, this also allocates a
 817  * kpreempt disp queue for that partition.
 818  */
 819 int
 820 cpupart_create(psetid_t *psid)
 821 {
 822         cpupart_t       *pp;
 823 
 824         ASSERT(pool_lock_held());
 825 
 826         pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
 827 
 828         mutex_enter(&cpu_lock);
 829         if (cp_numparts == cp_max_numparts) {
 830                 mutex_exit(&cpu_lock);
 831                 kmem_free(pp, sizeof (cpupart_t));
 832                 return (ENOMEM);
 833         }
 834         cp_numparts++;
 835         /* find the next free partition ID */
 836         while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
 837                 cp_id_next++;
 838         pp->cp_id = cp_id_next++;
 839         pp->cp_ncpus = 0;
 840         pp->cp_cpulist = NULL;
 841         pp->cp_attr = 0;
 842         klgrpset_clear(pp->cp_lgrpset);
 843         pp->cp_kp_queue.disp_maxrunpri = -1;
 844         pp->cp_kp_queue.disp_max_unbound_pri = -1;
 845         pp->cp_kp_queue.disp_cpu = NULL;
 846         pp->cp_gen = 0;
 847         DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
 848         *psid = CPTOPS(pp->cp_id);
 849         disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
 850         cpupart_kstat_create(pp);
 851         cpupart_lpl_initialize(pp);
 852 
 853         bitset_init(&pp->cp_cmt_pgs);
 854 
 855         /*
 856          * Initialize and size the partition's bitset of halted CPUs.
 857          */
 858         bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
 859         bitset_resize(&pp->cp_haltset, max_ncpus);
 860 
 861         /*
 862          * Pause all CPUs while changing the partition list, to make sure
 863          * the clock thread (which traverses the list without holding
 864          * cpu_lock) isn't running.
 865          */
 866         pause_cpus(NULL, NULL);
 867         pp->cp_next = cp_list_head;
 868         pp->cp_prev = cp_list_head->cp_prev;
 869         cp_list_head->cp_prev->cp_next = pp;
 870         cp_list_head->cp_prev = pp;
 871         start_cpus();
 872         mutex_exit(&cpu_lock);
 873 
 874         return (0);
 875 }
 876 
 877 /*
 878  * Move threads from the specified partition to cp_default.  If `unbind_all'
 879  * is set, move all threads; otherwise move only soft-bound threads.
 880  */
 881 static int
 882 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
 883 {
 884         void    *projbuf, *zonebuf;
 885         kthread_t *t;
 886         proc_t  *p;
 887         int     err = 0;
 888         psetid_t psid = pp->cp_id;
 889 
 890         ASSERT(pool_lock_held());
 891         ASSERT(MUTEX_HELD(&cpu_lock));
 892 
 893         if (pp == NULL || pp == &cp_default) {
 894                 return (EINVAL);
 895         }
 896 
 897         /*
 898          * Pre-allocate enough buffers for FSS for all active projects and
 899          * for all active zones on the system.  Unused buffers will be
 900          * freed later by fss_freebuf().
 901          */
 902         projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 903         zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 904 
 905         mutex_enter(&pidlock);
 906         t = curthread;
 907         do {
 908                 if (t->t_bind_pset == psid) {
 909 again:                  p = ttoproc(t);
 910                         mutex_enter(&p->p_lock);
 911                         if (ttoproc(t) != p) {
 912                                 /*
 913                                  * lwp_exit has changed this thread's process
 914                                  * pointer before we grabbed its p_lock.
 915                                  */
 916                                 mutex_exit(&p->p_lock);
 917                                 goto again;
 918                         }
 919 
 920                         /*
 921                          * Only threads with a revocable (soft) binding can be
 922                          * unbound, unless a forced unbind was requested.
 923                          */
 924                         if (unbind_all || TB_PSET_IS_SOFT(t)) {
 925                                 err = cpupart_bind_thread(t, PS_NONE, 1,
 926                                     projbuf, zonebuf);
 927                                 if (err) {
 928                                         mutex_exit(&p->p_lock);
 929                                         mutex_exit(&pidlock);
 930                                         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 931                                         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 932                                         return (err);
 933                                 }
 934                                 t->t_bind_pset = PS_NONE;
 935                         }
 936                         mutex_exit(&p->p_lock);
 937                 }
 938                 t = t->t_next;
 939         } while (t != curthread);
 940 
 941         mutex_exit(&pidlock);
 942         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 943         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 944         return (err);
 945 }
 946 
 947 /*
 948  * Destroy a partition.
 949  */
 950 int
 951 cpupart_destroy(psetid_t psid)
 952 {
 953         cpu_t   *cp, *first_cp;
 954         cpupart_t *pp, *newpp;
 955         int     err = 0;
 956 
 957         ASSERT(pool_lock_held());
 958         mutex_enter(&cpu_lock);
 959 
 960         pp = cpupart_find(psid);
 961         if (pp == NULL || pp == &cp_default) {
 962                 mutex_exit(&cpu_lock);
 963                 return (EINVAL);
 964         }
 965 
 966         /*
 967          * Unbind all the threads currently bound to the partition.
 968          */
 969         err = cpupart_unbind_threads(pp, B_TRUE);
 970         if (err) {
 971                 mutex_exit(&cpu_lock);
 972                 return (err);
 973         }
 974 
 975         newpp = &cp_default;
 976         while ((cp = pp->cp_cpulist) != NULL) {
 977                 if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
 978                         mutex_exit(&cpu_lock);
 979                         return (err);
 980                 }
 981         }
 982 
 983         ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
 984         ASSERT(bitset_is_null(&pp->cp_haltset));
 985 
 986         /*
 987          * Tear down the partition's sets of active CMT PGs and halted
 988          * CPUs now that all of its CPUs have left.
 989          */
 990         bitset_fini(&pp->cp_cmt_pgs);
 991         bitset_fini(&pp->cp_haltset);
 992 
 993         /*
 994          * Reset the pointers in any offline processors so they won't
 995          * try to rejoin the destroyed partition when they're turned
 996          * online.
 997          */
 998         first_cp = cp = CPU;
 999         do {
1000                 if (cp->cpu_part == pp) {
1001                         ASSERT(cp->cpu_flags & CPU_OFFLINE);
1002                         cp->cpu_part = newpp;
1003                 }
1004                 cp = cp->cpu_next;
1005         } while (cp != first_cp);
1006 
1007         /*
1008          * Pause all CPUs while changing the partition list, to make sure
1009          * the clock thread (which traverses the list without holding
1010          * cpu_lock) isn't running.
1011          */
1012         pause_cpus(NULL, NULL);
1013         pp->cp_prev->cp_next = pp->cp_next;
1014         pp->cp_next->cp_prev = pp->cp_prev;
1015         if (cp_list_head == pp)
1016                 cp_list_head = pp->cp_next;
1017         start_cpus();
1018 
1019         if (cp_id_next > pp->cp_id)
1020                 cp_id_next = pp->cp_id;
1021 
1022         if (pp->cp_kstat)
1023                 kstat_delete(pp->cp_kstat);
1024 
1025         cp_numparts--;
1026 
1027         disp_kp_free(&pp->cp_kp_queue);
1028 
1029         cpupart_lpl_teardown(pp);
1030 
1031         kmem_free(pp, sizeof (cpupart_t));
1032         mutex_exit(&cpu_lock);
1033 
1034         return (err);
1035 }
1036 
1037 
1038 /*
1039  * Return the ID of the partition to which the specified processor belongs.
1040  */
1041 psetid_t
1042 cpupart_query_cpu(cpu_t *cp)
1043 {
1044         ASSERT(MUTEX_HELD(&cpu_lock));
1045 
1046         return (CPTOPS(cp->cpu_part->cp_id));
1047 }
1048 
1049 
1050 /*
1051  * Attach a processor to an existing partition.
1052  */
1053 int
1054 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1055 {
1056         cpupart_t       *pp;
1057         int             err;
1058 
1059         ASSERT(pool_lock_held());
1060         ASSERT(MUTEX_HELD(&cpu_lock));
1061 
1062         pp = cpupart_find(psid);
1063         if (pp == NULL)
1064                 return (EINVAL);
1065         if (cp->cpu_flags & CPU_OFFLINE)
1066                 return (EINVAL);
1067 
1068         err = cpupart_move_cpu(cp, pp, forced);
1069         return (err);
1070 }
1071 
1072 /*
1073  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1074  * this just checks for a valid partition.  If numcpus is non-NULL but
1075  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1076  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1077  * and a list of those cpus up to the size originally in *numcpus is
1078  * stored in cpulist[].  Also, store the processor set id in *psid.
1079  * This is useful in case the processor set id passed in was PS_MYID.
1080  */
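     /*
      * An illustrative (not prescriptive) calling pattern queries the count
      * first and then fetches the list; error handling is omitted and the
      * `cpus' buffer name is arbitrary:
      *
      *    uint_t n = 0;
      *    psetid_t set = PS_MYID;
      *    processorid_t *cpus;
      *
      *    (void) cpupart_get_cpus(&set, NULL, &n);
      *    cpus = kmem_alloc(n * sizeof (processorid_t), KM_SLEEP);
      *    (void) cpupart_get_cpus(&set, cpus, &n);
      *
      * The set can change between the two calls; the second call always
      * passes the real CPU count back in *numcpus.
      */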
1081 int
1082 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1083 {
1084         cpupart_t       *pp;
1085         uint_t          ncpus;
1086         cpu_t           *c;
1087         int             i;
1088 
1089         mutex_enter(&cpu_lock);
1090         pp = cpupart_find(*psid);
1091         if (pp == NULL) {
1092                 mutex_exit(&cpu_lock);
1093                 return (EINVAL);
1094         }
1095         *psid = CPTOPS(pp->cp_id);
1096         ncpus = pp->cp_ncpus;
1097         if (numcpus) {
1098                 if (ncpus > *numcpus) {
1099                         /*
1100                          * Only copy as many cpus as were passed in, but
1101                          * pass back the real number.
1102                          */
1103                         uint_t t = ncpus;
1104                         ncpus = *numcpus;
1105                         *numcpus = t;
1106                 } else
1107                         *numcpus = ncpus;
1108 
1109                 if (cpulist) {
1110                         c = pp->cp_cpulist;
1111                         for (i = 0; i < ncpus; i++) {
1112                                 ASSERT(c != NULL);
1113                                 cpulist[i] = c->cpu_id;
1114                                 c = c->cpu_next_part;
1115                         }
1116                 }
1117         }
1118         mutex_exit(&cpu_lock);
1119         return (0);
1120 }
1121 
1122 /*
1123  * Reallocate kpreempt queues for each CPU partition.  Called from
1124  * disp_setup when a new scheduling class is loaded that increases the
1125  * number of priorities in the system.
1126  */
1127 void
1128 cpupart_kpqalloc(pri_t npri)
1129 {
1130         cpupart_t *cpp;
1131 
1132         ASSERT(MUTEX_HELD(&cpu_lock));
1133         cpp = cp_list_head;
1134         do {
1135                 disp_kp_alloc(&cpp->cp_kp_queue, npri);
1136                 cpp = cpp->cp_next;
1137         } while (cpp != cp_list_head);
1138 }
1139 
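     /*
      * Copy up to nelem of the partition's high-precision load averages into
      * buf, converted to the FSHIFT-based fixed-point format used by the
      * system load averages.  The caller must hold cpu_lock.
      */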
1140 int
1141 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1142 {
1143         cpupart_t *cp;
1144         int i;
1145 
1146         ASSERT(nelem >= 0);
1147         ASSERT(nelem <= LOADAVG_NSTATS);
1148         ASSERT(MUTEX_HELD(&cpu_lock));
1149 
1150         cp = cpupart_find(psid);
1151         if (cp == NULL)
1152                 return (EINVAL);
1153         for (i = 0; i < nelem; i++)
1154                 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1155 
1156         return (0);
1157 }
1158 
1159 
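     /*
      * Fill `list' (if non-NULL) with up to nelem processor set IDs and
      * return the number of partitions matching `flag': CP_ALL counts every
      * partition except the default, while CP_NONEMPTY counts only partitions
      * that currently contain CPUs.  The caller must hold cpu_lock.
      */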
1160 uint_t
1161 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1162 {
1163         uint_t numpart = 0;
1164         cpupart_t *cp;
1165 
1166         ASSERT(MUTEX_HELD(&cpu_lock));
1167         ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1168 
1169         if (list != NULL) {
1170                 cp = cp_list_head;
1171                 do {
1172                         if (((flag == CP_ALL) && (cp != &cp_default)) ||
1173                             ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1174                                 if (numpart == nelem)
1175                                         break;
1176                                 list[numpart++] = CPTOPS(cp->cp_id);
1177                         }
1178                         cp = cp->cp_next;
1179                 } while (cp != cp_list_head);
1180         }
1181 
1182         ASSERT(numpart < cp_numparts);
1183 
1184         if (flag == CP_ALL)
1185                 numpart = cp_numparts - 1; /* leave out default partition */
1186         else if (flag == CP_NONEMPTY)
1187                 numpart = cp_numparts_nonempty;
1188 
1189         return (numpart);
1190 }
1191 
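     /*
      * Set the attributes of the processor set identified by psid.  The
      * default partition must always keep PSET_NOESCAPE set.
      */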
1192 int
1193 cpupart_setattr(psetid_t psid, uint_t attr)
1194 {
1195         cpupart_t *cp;
1196 
1197         ASSERT(pool_lock_held());
1198 
1199         mutex_enter(&cpu_lock);
1200         if ((cp = cpupart_find(psid)) == NULL) {
1201                 mutex_exit(&cpu_lock);
1202                 return (EINVAL);
1203         }
1204         /*
1205          * The PSET_NOESCAPE attribute is always set for the default partition.
1206          */
1207         if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1208                 mutex_exit(&cpu_lock);
1209                 return (EINVAL);
1210         }
1211         cp->cp_attr = attr;
1212         mutex_exit(&cpu_lock);
1213         return (0);
1214 }
1215 
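     /*
      * Return the attributes of the processor set identified by psid
      * through *attrp.
      */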
1216 int
1217 cpupart_getattr(psetid_t psid, uint_t *attrp)
1218 {
1219         cpupart_t *cp;
1220 
1221         mutex_enter(&cpu_lock);
1222         if ((cp = cpupart_find(psid)) == NULL) {
1223                 mutex_exit(&cpu_lock);
1224                 return (EINVAL);
1225         }
1226         *attrp = cp->cp_attr;
1227         mutex_exit(&cpu_lock);
1228         return (0);
1229 }