/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions. cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */
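/*
 * As an illustration of the ordering above, a hypothetical caller that
 * needed all four locks (for some proc_t *p) would take them
 * outermost-first and drop them in the reverse order, e.g.:
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);
 *	...
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 *
 * This is only a sketch of the ordering rule; the routines in this file
 * take just the subset of these locks they need (see the ASSERTs at the
 * top of each function).
 */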

/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after the kmem subsystem is initialized.
 * This saves some memory since the space allocated reflects the actual number
 * of lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */

cpupart_t *cp_list_head;
cpupart_t cp_default;
static cpupartid_t cp_id_next;
uint_t cp_numparts;
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much. The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds). The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t cp_max_numparts;
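
/*
 * Since cp_max_numparts is only used when it is still zero at boot (see
 * cpupart_initialize_default()), an administrator could raise the limit
 * with an /etc/system entry along the lines of the following; the value
 * shown is purely illustrative:
 *
 *	set cp_max_numparts = 17
 */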

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets. A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue. The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set). Hence, the
 * numbering for processor sets and CPU partitions is identical. This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
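/*
 * For instance, with the macros above the default partition maps to
 * PS_NONE rather than to a visible processor set, while a user-created
 * processor set keeps the same numeric ID in both namespaces:
 *
 *	PSTOCP(PS_NONE) == CP_DEFAULT	CPTOPS(CP_DEFAULT) == PS_NONE
 *	PSTOCP(1) == 1			CPTOPS(1) == 1
 *
 * The "1" here is just an illustrative ID; real set IDs are assigned by
 * cpupart_create().
 */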

static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
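	/*
	 * cp_hp_avenrun[] holds the partition load averages as fixed-point
	 * values with 16 fractional bits, while the avenrun kstats use
	 * FSHIFT fractional bits, hence the right shift by (16 - FSHIFT)
	 * below.
	 */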
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized. We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the cpupart's lgrp partitions (lpls)
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration)...hence the "oversizing"
		 * by 1.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}
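
/*
 * Because each lpl_rset[] is over-allocated by one entry and zeroed, its
 * final slot is always NULL, so a consumer can walk a resource set without
 * consulting lpl_rset_sz. A minimal sketch of that iteration pattern (the
 * "rset" and "do_something" names below are hypothetical):
 *
 *	lpl_t **rset = lpl->lpl_rset;
 *	for (i = 0; rset[i] != NULL; i++)
 *		do_something(rset[i]);
 */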

/*
 * Tear down the cpupart's lgrp partitions
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list,
	 * lpl_bootstrap. It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0's cpu_lpl pointer to
	 * point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);


	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}


static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition. Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it. Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set. Note that no threads should be bound to this
		 * CPU since cpupart_move_thread() will refuse to move bound
		 * threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If the forced flag is set, unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads from weak-binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving between cpu partitions. This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strongly or weakly bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above. Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk through the active process list looking for threads
		 * that need a new home lgroup, or that last ran on the CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp, t,
					    t->t_pri);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t,
				    t->t_pri);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition. Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}

/*
 * Move thread to new partition. If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it. In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup. Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);	/* success */
}


/*
 * This function binds a thread to a partition. Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock). If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition. On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Move threads from the specified partition to cp_default. If unbind_all
 * is set, move all threads; otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;
	int err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system. Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * We can only unbind threads which have a revocable
			 * binding, unless a forced unbind was requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Tear down the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition. If numcpus is NULL,
 * this just checks for a valid partition. If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[]. Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
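/*
 * A minimal usage sketch (the variable names are hypothetical): to size a
 * buffer and then fetch the CPU list for the caller's own set, one could do
 *
 *	psetid_t psid = PS_MYID;
 *	uint_t ncpus = 0;
 *	processorid_t *ids;
 *
 *	if (cpupart_get_cpus(&psid, NULL, &ncpus) == 0) {
 *		ids = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *		(void) cpupart_get_cpus(&psid, ids, &ncpus);
 *	}
 *
 * Note that the set can change between the two calls, so the value returned
 * in *numcpus must be re-checked against the allocated size afterwards.
 */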
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition. Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * The PSET_NOESCAPE attribute for the default cpu partition is
	 * always set.
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}