/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 * pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */
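
/*
 * As an illustration only (not code from this file), a caller needing all
 * four locks would take them in the documented order and release them in
 * reverse; "p" is a hypothetical proc_t pointer:
 *
 *	pool_lock();			(pools configuration)
 *	mutex_enter(&cpu_lock);		(partition list)
 *	mutex_enter(&pidlock);		(process list)
 *	mutex_enter(&p->p_lock);	(per-process state)
 *	...
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */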

/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp facility
 * provides a temporary space to hold lpl information during system bootstrap.
 */

cpupart_t *cp_list_head;
cpupart_t cp_default;
static cpupartid_t cp_id_next;
uint_t cp_numparts;
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
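
/*
 * For example, assuming the usual values PS_NONE == -1 and CP_DEFAULT == 0
 * (illustrative; see <sys/procset.h> and <sys/cpupart.h> for the actual
 * definitions): PSTOCP(PS_NONE) yields CP_DEFAULT, CPTOPS(CP_DEFAULT)
 * yields PS_NONE, and any other ID maps to itself in both directions.
 */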

static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}
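
/*
 * A worked example of the shift above, assuming the usual FSHIFT of 8:
 * cp_hp_avenrun[] stores high-precision load averages as fixed point with
 * 16 fractional bits, so a load of 1.5 is kept as 0x18000.  Shifting right
 * by (16 - FSHIFT) == 8 gives 0x180, i.e. 1.5 in the coarser FSHIFT
 * fixed-point format that avenrun consumers expect.
 */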

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the cpupart's lgrp partitions (lpls)
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration), hence the "oversizing"
		 * by 1.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1),
		    KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}
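
/*
 * The NULL terminator lets resource-set walkers avoid consulting
 * lpl_rset_sz.  A minimal sketch of such an iteration (hypothetical
 * caller; do_something() is not a real function):
 *
 *	lpl_t **rp;
 *
 *	for (rp = lpl->lpl_rset; *rp != NULL; rp++)
 *		do_something(*rp);
 */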

/*
 * Tear down the cpupart's lgrp partitions
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default's list of lgrploads.
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list,
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates the CPU0 cpu_lpl pointer
	 * to point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}

static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set.  Note that no threads should be bound to
		 * this CPU since cpupart_move_threads will refuse to move
		 * bound threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If the forced flag is set, unbind all threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads from weak-binding to this CPU.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}
	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk through the active process list looking for
		 * threads that need a new home lgroup, or whose last
		 * CPU is the one being moved out of the partition.
		 */
		for (p = practive; p != NULL; p = p->p_next) {
			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {
				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different
				 * lpl.
				 */
				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */
				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >=
				    t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */
			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk the thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */
		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose the best lgroup for home when
			 * the thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if a thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}
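
/*
 * A minimal sketch of the calling convention asserted above (hypothetical
 * caller): cpu_lock, the thread's p_lock, and its thread lock must all be
 * held across the check:
 *
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&ttoproc(tp)->p_lock);
 *	thread_lock(tp);
 *	error = cpupart_movable_thread(tp, newpp, 0);
 *	thread_unlock(tp);
 *	mutex_exit(&ttoproc(tp)->p_lock);
 *	mutex_exit(&cpu_lock);
 */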

/*
 * Move a thread to a new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp,
			    &tp->t_cpupart->cp_lgrploads[tp->t_lpl->
			    lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);	/* success */
}

/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}

/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
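
/*
 * A minimal sketch of bringing a new set to life (hypothetical caller;
 * error handling elided; "some_id" stands in for an arbitrary online CPU
 * ID).  cpupart_create() takes cpu_lock itself, while cpupart_attach_cpu()
 * expects the caller to hold it:
 *
 *	psetid_t psid;
 *
 *	pool_lock();
 *	(void) cpupart_create(&psid);
 *	mutex_enter(&cpu_lock);
 *	(void) cpupart_attach_cpu(psid, cpu[some_id], 0);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */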

/*
 * Move threads from the specified partition to cp_default.  If `unbind_all'
 * is set, move all threads; otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;
	int err = 0;
	psetid_t psid;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}
	/* don't dereference pp until it has been validated */
	psid = pp->cp_id;

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Only threads with a revocable (soft) pset binding
			 * may be unbound, unless a forced unbind was
			 * requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Tear down the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}

/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}

/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}
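
/*
 * A minimal sketch of the usual two-call pattern (hypothetical caller):
 * query the count first, then fetch that many IDs.  Since the set can
 * change between the calls, the second call also passes the real count
 * back:
 *
 *	psetid_t psid = PS_MYID;
 *	uint_t ncpus;
 *	processorid_t *ids;
 *
 *	(void) cpupart_get_cpus(&psid, NULL, &ncpus);
 *	ids = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, ids, &ncpus);
 */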

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}
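
/*
 * A minimal sketch of a caller (hypothetical; cpu_lock must already be
 * held).  The returned values are the 1-, 5-, and 15-minute load averages
 * in FSHIFT fixed point, scaled exactly as in cpupart_kstat_update():
 *
 *	int loads[LOADAVG_NSTATS];
 *
 *	if (cpupart_get_loadavg(psid, loads, LOADAVG_NSTATS) == 0)
 *		one_min = loads[LOADAVG_1MIN];
 */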

uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * The PSET_NOESCAPE attribute of the default cpu partition is
	 * always set.
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}